From dea8eefd6214e3ad5b54795fa958ab721d58710c Mon Sep 17 00:00:00 2001
From: "kaf24@scramble.cl.cam.ac.uk" <kaf24@scramble.cl.cam.ac.uk>
Date: Sat, 31 Jan 2004 19:45:13 +0000
Subject: [PATCH] bitkeeper revision 1.699 (401c05c9TV2zsaZ_e3zpy-zaKxCetw)

timer.c, timer.h, sched.h:
  new file
Many files:
  Rolf's new timer interface, plus various cleanups.
---
 .rootkeys                                     |   3 +
 docs/interface.tex                            |  69 +-
 extras/mini-os/h/hypervisor.h                 |  34 +-
 extras/mini-os/time.c                         |  40 +-
 tools/misc/Makefile                           |   4 +-
 tools/misc/xen_read_console.c                 |  11 +-
 tools/xc/lib/Makefile                         |   8 +-
 xen/arch/i386/entry.S                         |   1 +
 xen/arch/i386/time.c                          |   6 -
 xen/common/ac_timer.c                         |   6 -
 xen/common/keyhandler.c                       |  44 +-
 xen/common/schedule.c                         | 262 +++--
 xen/drivers/block/xen_vbd.c                   |   2 +-
 xen/include/hypervisor-ifs/hypervisor-if.h    |  32 +-
 xen/include/xeno/sched.h                      |  40 +-
 xen/net/dev.c                                 |  11 +
 xenolinux-2.4.24-sparse/arch/xeno/config.in   |   4 +-
 xenolinux-2.4.24-sparse/arch/xeno/defconfig   |   5 +-
 .../arch/xeno/drivers/network/network.c       |  13 +-
 .../arch/xeno/kernel/process.c                |  30 +-
 .../arch/xeno/kernel/time.c                   | 155 ++-
 .../include/asm-xeno/hypervisor.h             |  24 +
 xenolinux-2.4.24-sparse/include/linux/sched.h | 966 +++++++++++++++++
 xenolinux-2.4.24-sparse/include/linux/timer.h |  77 ++
 xenolinux-2.4.24-sparse/kernel/panic.c        |   3 +-
 xenolinux-2.4.24-sparse/kernel/timer.c        | 968 ++++++++++++++++++
 26 files changed, 2588 insertions(+), 230 deletions(-)
 create mode 100644 xenolinux-2.4.24-sparse/include/linux/sched.h
 create mode 100644 xenolinux-2.4.24-sparse/include/linux/timer.h
 create mode 100644 xenolinux-2.4.24-sparse/kernel/timer.c

diff --git a/.rootkeys b/.rootkeys
index 65a76aac8b..312d4df82a 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -581,11 +581,14 @@
 3f689063nhrIRsMMZjZxMFk7iEINqQ xenolinux-2.4.24-sparse/include/asm-xeno/xeno_proc.h
 3f056927gMHl7mWB89rb73JahbhQIA xenolinux-2.4.24-sparse/include/linux/blk.h
 3e5a4e68WLX3B8owTvktP3HHOtznPQ xenolinux-2.4.24-sparse/include/linux/major.h
+401c0590D_kwJDU59X8NyvqSv_Cl2A xenolinux-2.4.24-sparse/include/linux/sched.h
 3e5a4e686V0nioX2ZpFf056sgvdiQw xenolinux-2.4.24-sparse/include/linux/sunrpc/debug.h
+401c0592pLrp_aCbQRo9GXiYQQaVVA xenolinux-2.4.24-sparse/include/linux/timer.h
 3e5a4e68W_hpMlM3u_-QOKMp3gzcwQ xenolinux-2.4.24-sparse/init/do_mounts.c
 3e5a4e68TJJavrunYwTAnLRSBxSYqQ xenolinux-2.4.24-sparse/kernel/panic.c
 3f1056a9LXNTgSzITNh1mb-MIKV1Ng xenolinux-2.4.24-sparse/kernel/printk.c
 3f9d4b44247udoqWEgFkaHiWv6Uvyg xenolinux-2.4.24-sparse/kernel/time.c
+401c059bjLBFYHRD4Py2uM3eA1D4zQ xenolinux-2.4.24-sparse/kernel/timer.c
 3eba8f878XjouY21EkQBXwYBsPsipQ xenolinux-2.4.24-sparse/lndir-rel
 3e6e7c1efbQe93xCvOpOVCnXTMmQ5w xenolinux-2.4.24-sparse/mkbuildtree
 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.24-sparse/mm/memory.c
diff --git a/docs/interface.tex b/docs/interface.tex
index ac942658db..1c4ca937d7 100644
--- a/docs/interface.tex
+++ b/docs/interface.tex
@@ -117,18 +117,20 @@ time.
 
 
 \section{Cycle counter time}
-This provides the finest-grained, free-running time reference, with the approximate
-frequency being publicly accessible. The cycle counter time is used to accurately
-extrapolate the other time references. On SMP machines it is currently assumed
-that the cycle counter time is synchronised between CPUs. The current x86-based
-implementation achieves this within inter-CPU communication latencies.
+This provides the finest-grained, free-running time reference, with the
+approximate frequency being publicly accessible. The cycle counter time is
+used to accurately extrapolate the other time references. On SMP machines
+it is currently assumed that the cycle counter time is synchronised between
+CPUs. The current x86-based implementation achieves this within inter-CPU
+communication latencies.
 
 \section{System time}
-This is a 64-bit value containing the nanoseconds elapsed since boot time. Unlike
-cycle counter time, system time accurately reflects the passage of real time, i.e.
-it is adjusted several times a second for timer drift. This is done by running an
-NTP client in {\it domain0} on behalf of the machine, feeding updates to the 
-hypervisor. Intermediate values can be extrapolated using the cycle counter. 
+This is a 64-bit value containing the nanoseconds elapsed since boot
+time. Unlike cycle counter time, system time accurately reflects the
+passage of real time, i.e.  it is adjusted several times a second for timer
+drift. This is done by running an NTP client in {\it domain0} on behalf of
+the machine, feeding updates to the hypervisor. Intermediate values can be
+extrapolated using the cycle counter.
 
 \section{Wall clock time}
 This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and
@@ -140,10 +142,39 @@ and remain perfectly in time.
 
 
 \section{Domain virtual time}
-This progresses at the same pace as cycle counter time, but only while a domain
-is executing. It stops while a domain is de-scheduled. Therefore the share of the 
-CPU that a domain receives is indicated by the rate at which its domain virtual
-time increases, relative to the rate at which cycle counter time does so.
+This progresses at the same pace as cycle counter time, but only while a
+domain is executing. It stops while a domain is de-scheduled. Therefore the
+share of the CPU that a domain receives is indicated by the rate at which
+its domain virtual time increases, relative to the rate at which cycle
+counter time does so.
+
+\section{Time interface}
+Xen exports some timestamps to guest operating systems through their shared
+info page. Timestamps are provided for system time and wall-clock time. Xen
+also provides the cycle counter values at the time of the last update
+allowing guests to calculate the current values. The cpu frequency and a
+scaling factor are provided for guests to convert cycle counter values to
+real time. Since all time stamps need to be updated and read
+\emph{atomically}, two version numbers are also stored in the shared info
+page.
+
+Xen will ensure that the time stamps are updated frequently enough to avoid
+an overflow of the cycle counter values. A guest can check whether its notion
+of time is up-to-date by comparing the version numbers.
+
+\section{Timer events}
+
+Xen maintains a periodic timer (currently with a 10ms period) which sends a
+timer event to the currently executing domain. This allows Guest OSes to
+keep track of the passing of time when executing. The scheduler also
+arranges for a newly activated domain to receive a timer event when
+scheduled so that the Guest OS can adjust to the passage of time while it
+has been inactive.
+
+In addition, Xen exports a hypercall interface to each domain which allows
+them to request a timer event to be sent to them at the specified system
+time. Guest OSes may use this timer to implement timeout values when they
+block.
 
 \chapter{Memory}
 
@@ -371,7 +402,15 @@ Notify hypervisor of updates to transmit and/or receive descriptor rings.
 Notify hypervisor that fpu registers needed to be save on context switch.
 
 \section{ sched\_op(unsigned long op)} 
-Request scheduling operation from hypervisor. The options are: yield, stop, and exit.
+Request scheduling operation from hypervisor. The options are: {\it yield},
+{\it block}, {\it stop}, and {\it exit}. {\it yield} keeps the calling
+domain run-able but may cause a reschedule if other domains are
+run-able. {\it block} removes the calling domain from the run queue and the
+domain sleeps until an event is delivered to it. {\it stop} and {\it exit}
+should be self-explanatory.
+
+\section{ set\_dom\_timer(dom\_timer\_arg\_t *timer\_arg)} 
+Request a timer event to be sent at the specified system time.
 
 \section{ dom0\_op(dom0\_op\_t *op)} 
 Administrative domain operations for domain management. The options are:
diff --git a/extras/mini-os/h/hypervisor.h b/extras/mini-os/h/hypervisor.h
index a4f5625692..92bb37cdd2 100644
--- a/extras/mini-os/h/hypervisor.h
+++ b/extras/mini-os/h/hypervisor.h
@@ -1,3 +1,10 @@
+/******************************************************************************
+ * hypervisor.h
+ * 
+ * Linux-specific hypervisor handling.
+ * 
+ * Copyright (c) 2002, K A Fraser
+ */
 
 #ifndef _HYPERVISOR_H_
 #define _HYPERVISOR_H_
@@ -135,6 +142,17 @@ static __inline__ int HYPERVISOR_yield(void)
     return ret;
 }
 
+static __inline__ int HYPERVISOR_block(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_block) );
+
+    return ret;
+}
+
 static __inline__ int HYPERVISOR_exit(void)
 {
     int ret;
@@ -146,13 +164,25 @@ static __inline__ int HYPERVISOR_exit(void)
     return ret;
 }
 
-static __inline__ int HYPERVISOR_stop(void)
+static __inline__ int HYPERVISOR_stop(unsigned long srec)
 {
     int ret;
+    /* NB. On suspend, control software expects a suspend record in %esi. */
     __asm__ __volatile__ (
         TRAP_INSTR
         : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
-        "b" (SCHEDOP_stop) );
+        "b" (SCHEDOP_stop), "S" (srec) : "memory" );
+
+    return ret;
+}
+
+static __inline__ long HYPERVISOR_set_dom_timer(void *timer_arg)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer),
+        "b" (timer_arg) : "memory" );
 
     return ret;
 }
diff --git a/extras/mini-os/time.c b/extras/mini-os/time.c
index 447e164987..12356b0a03 100644
--- a/extras/mini-os/time.c
+++ b/extras/mini-os/time.c
@@ -1,20 +1,14 @@
 /* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
  ****************************************************************************
  * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 - Keir Fraser - University of Cambridge 
  ****************************************************************************
  *
  *        File: time.c
- *      Author: Rolf Neugebauer (neugebar@dcs.gla.ac.uk)
- *     Changes: 
- *              
- *        Date: Jul 2003
- * 
- * Environment: Xen Minimal OS
+ *      Author: Rolf Neugebauer and Keir Fraser
+ *
  * Description: Simple time and timer functions
  *
- ****************************************************************************
- * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $
- ****************************************************************************
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
@@ -105,6 +99,29 @@ static __inline__ unsigned long get_time_delta_usecs(void)
     return (unsigned long)delta;
 }
 
+s64 get_s_time (void)
+{
+    u64 u_delta;
+    s64 ret;
+
+ again:
+
+    u_delta = get_time_delta_usecs();
+    ret = shadow_system_time + (1000 * u_delta);
+
+    if ( unlikely(!TIME_VALUES_UP_TO_DATE) )
+    {
+        /*
+         * We may have blocked for a long time, rendering our calculations
+         * invalid (e.g. the time delta may have overflowed). Detect that
+         * and recalculate with fresh values.
+         */
+        get_time_values_from_xen();
+        goto again;
+    }
+
+    return ret;
+}
 
 void gettimeofday(struct timeval *tv)
 {
@@ -123,11 +140,16 @@ void gettimeofday(struct timeval *tv)
 }
 
 
+/*
+ * Just a dummy 
+ */
 static void timer_handler(int ev, struct pt_regs *regs)
 {
     static int i;
     struct timeval tv;
 
+    get_time_values_from_xen();
+
     i++;
     if (i >= 1000) {
         gettimeofday(&tv);
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 19d25b4748..597e68da17 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -16,7 +16,7 @@ all: $(TARGETS)
 
 install: all
 	mkdir -p /usr/bin
-	cp -a $(INSTALL) /usr/bin
+	cp $(INSTALL) /usr/bin
 	chmod 755 /usr/bin/xen-mkdevnodes
 	chmod 755 /usr/bin/xen_nat_enable
 	chmod 755 /usr/bin/xen-clone
@@ -24,7 +24,7 @@ install: all
 
 dist: all
 	mkdir -p ../../../install/bin
-	cp -a $(INSTALL) ../../../install/bin
+	cp $(INSTALL) ../../../install/bin
 	chmod 755 ../../../install/bin/xen-mkdevnodes
 	chmod 755 ../../../install/bin/xen_nat_enable
 	chmod 755 ../../../install/bin/xen-clone
diff --git a/tools/misc/xen_read_console.c b/tools/misc/xen_read_console.c
index 766d24f6f3..1352de8a6f 100644
--- a/tools/misc/xen_read_console.c
+++ b/tools/misc/xen_read_console.c
@@ -11,9 +11,9 @@
 
 int main(void)
 {
-    unsigned char buf[208];
+    unsigned char buf[208], filtered[208];
     struct sockaddr_in addr, from;
-    int fromlen = sizeof(from);
+    int fromlen = sizeof(from), i, j;
     int len, fd = socket(PF_INET, SOCK_DGRAM, 0);
     
     if ( fd < 0 )
@@ -46,7 +46,12 @@ int main(void)
         if ( buf[len-1] != '\n' ) { buf[len] = '\n'; len++; }
         buf[len] = '\0';
 
-        printf("[%d] %s", ntohs(from.sin_port),buf);
+        for ( i = 0, j = 0; i <= len; i++ ) /* '<=': copy the NUL at buf[len] */
+            if ( (buf[i] == '\n') || (buf[i] == '\0') ||
+                 ((buf[i] >= 32) && (buf[i] <= 126)) )
+                filtered[j++] = buf[i];
+
+        printf("[%d] %s", ntohs(from.sin_port), filtered);
 
         fromlen = sizeof(from);
     }
diff --git a/tools/xc/lib/Makefile b/tools/xc/lib/Makefile
index 2693372048..188478cd25 100644
--- a/tools/xc/lib/Makefile
+++ b/tools/xc/lib/Makefile
@@ -21,17 +21,17 @@ check-for-zlib:
 install: all
 	mkdir -p /usr/lib
 	mkdir -p /usr/include
-	cp -a $(LIB) /usr/lib
+	cp $(LIB) /usr/lib
 	chmod 755 /usr/lib/$(LIB)
-	cp -a xc.h /usr/include
+	cp xc.h /usr/include
 	chmod 644 /usr/include/xc.h
 
 dist: all
 	mkdir -p ../../../../install/lib
 	mkdir -p ../../../../install/include
-	cp -a $(LIB) ../../../../install/lib
+	cp $(LIB) ../../../../install/lib
 	chmod 755 ../../../../install/lib/$(LIB)
-	cp -a xc.h ../../../../install/include
+	cp xc.h ../../../../install/include
 	chmod 644 ../../../../install/include/xc.h
 
 clean:
diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S
index 08824dd920..4a135e5212 100644
--- a/xen/arch/i386/entry.S
+++ b/xen/arch/i386/entry.S
@@ -713,6 +713,7 @@ ENTRY(hypervisor_call_table)
         .long SYMBOL_NAME(do_net_io_op)
         .long SYMBOL_NAME(do_fpu_taskswitch)
         .long SYMBOL_NAME(do_sched_op)
+        .long SYMBOL_NAME(do_set_timer_op)
         .long SYMBOL_NAME(do_dom0_op)
         .long SYMBOL_NAME(do_network_op)
         .long SYMBOL_NAME(do_block_io_op)
diff --git a/xen/arch/i386/time.c b/xen/arch/i386/time.c
index 1328f31fd4..8d328c34b6 100644
--- a/xen/arch/i386/time.c
+++ b/xen/arch/i386/time.c
@@ -37,12 +37,6 @@
 #include <asm/fixmap.h>
 #include <asm/mc146818rtc.h>
 
-#ifdef TIME_TRACE
-#define TRC(_x) _x
-#else
-#define TRC(_x)
-#endif
-
 extern rwlock_t xtime_lock;
 extern unsigned long wall_jiffies;
 
diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c
index c3d2f51b37..4da998d45d 100644
--- a/xen/common/ac_timer.c
+++ b/xen/common/ac_timer.c
@@ -27,12 +27,6 @@
 #include <asm/system.h>
 #include <asm/desc.h>
 
-#ifdef AC_TIMER_TRACE
-#define TRC(_x) _x
-#else
-#define TRC(_x)
-#endif
-
 /*
  * We pull handlers off the timer list this far in future,
  * rather than reprogramming the time hardware.
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index c03aa908ab..3c92eb976f 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -1,5 +1,7 @@
+
 #include <xeno/keyhandler.h> 
 #include <xeno/reboot.h>
+#include <xeno/event.h>
 
 #define KEY_MAX 256
 #define STR_MAX  64
@@ -80,40 +82,48 @@ static void kill_dom0(u_char key, void *dev_id, struct pt_regs *regs)
 /* XXX SMH: this is keir's fault */
 static char *task_states[] = 
 { 
-    "Runnable", 
-    "Interruptible Sleep", 
-    "Uninterruptible Sleep", 
-    NULL, "Stopped", 
-    NULL, NULL, NULL, "Dying", 
+    "Runnable  ", 
+    "Int Sleep ", 
+    "UInt Sleep", 
+    NULL,
+    "Stopped   ", 
+    NULL,
+    NULL,
+    NULL,
+    "Dying     ", 
 }; 
 
 void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs) 
 {
-    unsigned long       flags; 
+    unsigned long       flags, cpu_mask = 0; 
     struct task_struct *p; 
     shared_info_t      *s; 
+    s_time_t            now = NOW();
 
-    printk("'%c' pressed -> dumping task queues\n", key); 
+    printk("'%c' pressed -> dumping task queues (now=0x%X:%08X)\n", key,
+           (u32)(now>>32), (u32)now); 
 
     read_lock_irqsave(&tasklist_lock, flags); 
 
     p = &idle0_task;
     do {
         printk("Xen: DOM %d, CPU %d [has=%c], state = %s, "
-	       "hyp_events = %08x\n", 
-	       p->domain, p->processor, p->has_cpu ? 'T':'F', 
-	       task_states[p->state], p->hyp_events); 
-	s = p->shared_info; 
-	if( !is_idle_task(p) )
+               "hyp_events = %08x\n", 
+               p->domain, p->processor, p->has_cpu ? 'T':'F', 
+               task_states[p->state], p->hyp_events); 
+        s = p->shared_info; 
+        if( !is_idle_task(p) )
         {
-	    printk("Guest: events = %08lx, events_mask = %08lx\n", 
-		   s->events, s->events_mask); 
-	    printk("Notifying guest...\n"); 
-	    set_bit(_EVENT_DEBUG, &s->events); 
-	}
+            printk("Guest: events = %08lx, events_mask = %08lx\n", 
+                   s->events, s->events_mask); 
+            printk("Notifying guest...\n"); 
+            cpu_mask |= mark_guest_event(p, _EVENT_DEBUG);
+        }
     } while ( (p = p->next_task) != &idle0_task );
 
     read_unlock_irqrestore(&tasklist_lock, flags); 
+
+    guest_event_notify(cpu_mask);
 }
 
 extern void perfc_printall (u_char key, void *dev_id, struct pt_regs *regs);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 2b834d93e3..5352bbb6a6 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -5,7 +5,7 @@
  ****************************************************************************
  *
  *        File: common/schedule.c
- *      Author: Rolf Neugebar & Keir Fraser
+ *      Author: Rolf Neugebauer & Keir Fraser
  * 
  * Description: CPU scheduling
  *              implements A Borrowed Virtual Time scheduler.
@@ -24,16 +24,13 @@
 #include <xeno/timer.h>
 #include <xeno/perfc.h>
 
-#undef SCHEDULER_TRACE
-#ifdef SCHEDULER_TRACE
-#define TRC(_x) _x
-#else
-#define TRC(_x)
-#endif
+/*#define WAKEUP_HISTO*/
+/*#define BLOCKTIME_HISTO*/
 
-/*#define SCHED_HISTO*/
-#ifdef SCHED_HISTO
+#if defined(WAKEUP_HISTO)
 #define BUCKETS 31
+#elif defined(BLOCKTIME_HISTO)
+#define BUCKETS 200
 #endif
 
 #define MCU            (s32)MICROSECS(100)    /* Minimum unit */
@@ -48,7 +45,7 @@ typedef struct schedule_data_st
     struct task_struct *idle;           /* idle task for this cpu */
     u32                 svt;            /* system virtual time. per CPU??? */
     struct ac_timer     s_timer;        /* scheduling timer  */
-#ifdef SCHED_HISTO
+#ifdef BUCKETS
     u32                 hist[BUCKETS];  /* for scheduler latency histogram */
 #endif
 } __cacheline_aligned schedule_data_t;
@@ -56,19 +53,25 @@ static schedule_data_t schedule_data[NR_CPUS];
 
 spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
 
-/* Skanky periodic event to all guests. This must die in the next release! */
-static struct ac_timer v_timer; 
+/* Per-CPU periodic timer sends an event to the currently-executing domain. */
+static struct ac_timer t_timer[NR_CPUS]; 
 
 /*
- * Per-CPU timer to ensure that even guests with very long quantums get
+ * Per-CPU timer which ensures that even guests with very long quantums get
  * their time-of-day state updated often enough to avoid wrapping.
  */
 static struct ac_timer fallback_timer[NR_CPUS];
 
-static void virt_timer(unsigned long foo);
-static void dump_rqueue(struct list_head *queue, char *name);
-
+/* Various timer handlers. */
+static void s_timer_fn(unsigned long unused);
+static void t_timer_fn(unsigned long unused);
+static void dom_timer_fn(unsigned long data);
+static void fallback_timer_fn(unsigned long unused);
 
+/*
+ * Wrappers for run-queue management. Must be called with the schedule_lock
+ * held.
+ */
 static inline void __add_to_runqueue_head(struct task_struct * p)
 {    
     list_add(&p->run_list, &schedule_data[p->processor].runqueue);
@@ -93,6 +96,10 @@ static inline int __task_on_runqueue(struct task_struct *p)
 #define next_domain(p) \\
         list_entry((p)->run_list.next, struct task_struct, run_list)
 
+/*
+ * Calculate the effective virtual time for a domain. Take into account
+ * warping limits.
+ */
 static void __calc_evt(struct task_struct *p)
 {
     s_time_t now = NOW();
@@ -134,14 +141,21 @@ void sched_add_domain(struct task_struct *p)
     } 
     else 
     {
-        /* set avt end evt to system virtual time */
+        /* Set avt and evt to system virtual time. */
         p->avt         = schedule_data[p->processor].svt;
         p->evt         = schedule_data[p->processor].svt;
-        /* set some default values here */
+        /* Set some default values here. */
         p->warpback    = 0;
         p->warp        = 0;
         p->warpl       = 0;
         p->warpu       = 0;
+
+        /* Initialise the per-domain timer. */
+        init_ac_timer(&p->timer);
+        p->timer.cpu      =  p->processor;
+        p->timer.data     = (unsigned long)p;
+        p->timer.function = &dom_timer_fn;
+
     }
 }
 
@@ -187,7 +201,7 @@ void __wake_up(struct task_struct *p)
     p->warped    = NOW();
     __calc_evt(p);
 
-#ifdef SCHED_HISTO
+#ifdef WAKEUP_HISTO
     p->wokenup = NOW();
 #endif
 }
@@ -200,16 +214,31 @@ void wake_up(struct task_struct *p)
     spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
 }
 
-/* Voluntarily yield the processor to another domain, until an event occurs. */
-long do_yield(void)
+/* 
+ * Block the currently-executing domain until a pertinent event occurs.
+ */
+static long do_block(void)
 {
+    set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask);
     current->state = TASK_INTERRUPTIBLE;
-    current->warpback = 0; /* XXX should only do this when blocking */
+    current->warpback = 0; 
     __enter_scheduler();
     return 0;
 }
 
-/* Demultiplex scheduler-related hypercalls. */
+/*
+ * Voluntarily yield the processor for this allocation.
+ */
+static long do_yield(void)
+{
+    __enter_scheduler();
+    return 0;
+}
+
+
+/*
+ * Demultiplex scheduler-related hypercalls.
+ */
 long do_sched_op(unsigned long op)
 {
     long ret = 0;
@@ -223,14 +252,24 @@ long do_sched_op(unsigned long op)
         break;
     }
 
+    case SCHEDOP_block:
+    {
+        ret = do_block();
+        break;
+    }
+
     case SCHEDOP_exit:
     {
+        DPRINTK("DOM%d killed itself!\n", current->domain);
+        DPRINTK(" EIP == %08lx\n", get_execution_context()->eip);
         kill_domain();
         break;
     }
 
     case SCHEDOP_stop:
     {
+        DPRINTK("DOM%d stopped itself!\n", current->domain);
+        DPRINTK(" EIP == %08lx\n", get_execution_context()->eip);
         stop_domain();
         break;
     }
@@ -242,6 +281,23 @@ long do_sched_op(unsigned long op)
     return ret;
 }
 
+/* Per-domain one-shot-timer hypercall. */
+long do_set_timer_op(unsigned long timeout_hi, unsigned long timeout_lo)
+{
+    struct task_struct *p = current;
+
+    rem_ac_timer(&p->timer);
+    
+    if ( (timeout_hi != 0) || (timeout_lo != 0) )
+    {
+        p->timer.expires = ((s_time_t)timeout_hi<<32) | ((s_time_t)timeout_lo);
+        add_ac_timer(&p->timer);
+    }
+
+    return 0;
+}
+
+
 /* Control the scheduler. */
 long sched_bvtctl(unsigned long c_allow)
 {
@@ -330,7 +386,7 @@ asmlinkage void __enter_scheduler(void)
 {
     struct task_struct *prev = current, *next = NULL, *next_prime, *p;
     struct list_head   *tmp;
-    int                 this_cpu = prev->processor;
+    int                 cpu = prev->processor;
     s_time_t            now;
     s32                 r_time;     /* time for new dom to run */
     s32                 ranfor;     /* assume we never run longer than 2.1s! */
@@ -339,11 +395,11 @@ asmlinkage void __enter_scheduler(void)
 
     perfc_incrc(sched_run);
 
-    spin_lock_irq(&schedule_lock[this_cpu]);
+    spin_lock_irq(&schedule_lock[cpu]);
 
     now = NOW();
 
-    rem_ac_timer(&schedule_data[this_cpu].s_timer);
+    rem_ac_timer(&schedule_data[cpu].s_timer);
 
     ASSERT(!in_interrupt());
     ASSERT(__task_on_runqueue(prev));
@@ -374,21 +430,21 @@ asmlinkage void __enter_scheduler(void)
     clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
 
     /* We should at least have the idle task */
-    ASSERT(!list_empty(&schedule_data[this_cpu].runqueue));
+    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
 
     /*
      * scan through the run queue and pick the task with the lowest evt
      * *and* the task the second lowest evt.
      * this code is O(n) but we expect n to be small.
      */
-    next       = schedule_data[this_cpu].idle;
+    next       = schedule_data[cpu].idle;
     next_prime = NULL;
 
     next_evt       = ~0U;
     next_prime_evt = ~0U;
     min_avt        = ~0U;
 
-    list_for_each ( tmp, &schedule_data[this_cpu].runqueue )
+    list_for_each ( tmp, &schedule_data[cpu].runqueue )
     {
         p = list_entry(tmp, struct task_struct, run_list);
         if ( p->evt < next_evt )
@@ -416,16 +472,16 @@ asmlinkage void __enter_scheduler(void)
 
     /* Update system virtual time. */
     if ( min_avt != ~0U )
-        schedule_data[this_cpu].svt = min_avt;
+        schedule_data[cpu].svt = min_avt;
 
     /* check for virtual time overrun on this cpu */
-    if ( schedule_data[this_cpu].svt >= 0xf0000000 )
+    if ( schedule_data[cpu].svt >= 0xf0000000 )
     {
         u_long t_flags; 
         write_lock_irqsave(&tasklist_lock, t_flags); 
         p = &idle0_task;
         do {
-            if ( (p->processor == this_cpu) && !is_idle_task(p) )
+            if ( (p->processor == cpu) && !is_idle_task(p) )
             {
                 p->evt -= 0xe0000000;
                 p->avt -= 0xe0000000;
@@ -433,7 +489,7 @@ asmlinkage void __enter_scheduler(void)
         } 
         while ( (p = p->next_task) != &idle0_task );
         write_unlock_irqrestore(&tasklist_lock, t_flags); 
-        schedule_data[this_cpu].svt -= 0xe0000000;
+        schedule_data[cpu].svt -= 0xe0000000;
     }
 
     /* work out time for next run through scheduler */
@@ -461,46 +517,43 @@ asmlinkage void __enter_scheduler(void)
  sched_done:
     ASSERT(r_time >= ctx_allow);
 
-#ifndef NDEBUG
-    if ( r_time < ctx_allow )
-    {
-        printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time);
-        dump_rqueue(&schedule_data[this_cpu].runqueue, "foo");
-    }
-#endif
-
     prev->has_cpu = 0;
     next->has_cpu = 1;
 
-    schedule_data[this_cpu].curr = next;
+    schedule_data[cpu].curr = next;
 
     next->lastschd = now;
 
     /* reprogramm the timer */
-    schedule_data[this_cpu].s_timer.expires  = now + r_time;
-    add_ac_timer(&schedule_data[this_cpu].s_timer);
+    schedule_data[cpu].s_timer.expires  = now + r_time;
+    add_ac_timer(&schedule_data[cpu].s_timer);
+
+    spin_unlock_irq(&schedule_lock[cpu]);
 
-    spin_unlock_irq(&schedule_lock[this_cpu]);
+    /* Ensure that the domain has an up-to-date time base. */
+    if ( !is_idle_task(next) )
+        update_dom_time(next->shared_info);
 
-    /* done, switch tasks */
     if ( unlikely(prev == next) )
-    {
-        /* We won't go through the normal tail, so do this by hand */
-        update_dom_time(prev->shared_info);
         return;
-    }
 
     perfc_incrc(sched_ctx);
-#ifdef SCHED_HISTO
+
+#if defined(WAKEUP_HISTO)
+    if ( !is_idle_task(next) && next->wokenup ) {
+        ulong diff = (ulong)(now - next->wokenup);
+        diff /= (ulong)MILLISECS(1);
+        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
+        else                    schedule_data[cpu].hist[BUCKETS-1]++;
+    }
+    next->wokenup = (s_time_t)0;
+#elif defined(BLOCKTIME_HISTO)
+    prev->lastdeschd = now;
+    if ( !is_idle_task(next) )
     {
-        ulong diff; /* should fit in 32bits */
-        if (!is_idle_task(next) && next->wokenup) {
-            diff = (ulong)(now - next->wokenup);
-            diff /= (ulong)MILLISECS(1);
-            if (diff <= BUCKETS-2)  schedule_data[this_cpu].hist[diff]++;
-            else                    schedule_data[this_cpu].hist[BUCKETS-1]++;
-        }
-        next->wokenup = (s_time_t)0;
+        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
+        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
+        else                    schedule_data[cpu].hist[BUCKETS-1]++;
     }
 #endif
 
@@ -509,8 +562,10 @@ asmlinkage void __enter_scheduler(void)
     if ( unlikely(prev->state == TASK_DYING) ) 
         put_task_struct(prev);
 
-    update_dom_time(next->shared_info);
-
+    /* Mark a timer event for the newly-scheduled domain. */
+    if ( !is_idle_task(next) )
+        set_bit(_EVENT_TIMER, &next->shared_info->events);
+    
     schedule_tail(next);
 
     BUG();
@@ -524,55 +579,57 @@ int idle_cpu(int cpu)
 }
 
 
-/* The scheduler timer. */
-static void sched_timer(unsigned long unused)
+/****************************************************************************
+ * Timers: the scheduler utilises a number of timers
+ * - s_timer: per CPU timer for preemption and scheduling decisions
+ * - t_timer: per CPU periodic timer to send timer interrupt to current dom
+ * - dom_timer: per domain timer to specify timeout values
+ * - fallback_timer: safeguard to ensure time is up to date
+ ****************************************************************************/
+
+/* The scheduler timer: force a run through the scheduler. */
+static void s_timer_fn(unsigned long unused)
 {
-    int                 cpu  = smp_processor_id();
-    struct task_struct *curr = schedule_data[cpu].curr;
-    /* cause a reschedule */
-    set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events);
+    set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events);
     perfc_incrc(sched_irq);
 }
 
-/* The Domain virtual time timer */
-static void virt_timer(unsigned long unused)
+/* Periodic tick timer: send timer event to current domain. */
+static void t_timer_fn(unsigned long unused)
 {
-    unsigned long flags, cpu_mask = 0;
-    struct task_struct *p;
-    s_time_t now;
+    struct task_struct *p = current;
 
-    /* send virtual timer interrupt */
-    read_lock_irqsave(&tasklist_lock, flags);
-    p = &idle0_task;
-    do {
-        if ( is_idle_task(p) ) continue;
-        cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
-    }
-    while ( (p = p->next_task) != &idle0_task );
-    read_unlock_irqrestore(&tasklist_lock, flags);
-    guest_event_notify(cpu_mask);
+    if ( !is_idle_task(p) ) 
+        set_bit(_EVENT_TIMER, &p->shared_info->events);
 
-    now = NOW();
-    v_timer.expires = now + MILLISECS(20);
-    add_ac_timer(&v_timer);
+    t_timer[p->processor].expires = NOW() + MILLISECS(10);
+    add_ac_timer(&t_timer[p->processor]);
+}
+
+/* Domain timer function, sends a virtual timer interrupt to domain */
+static void dom_timer_fn(unsigned long data)
+{
+    unsigned long cpu_mask = 0;
+    struct task_struct *p = (struct task_struct *)data;
+
+    cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
+    guest_event_notify(cpu_mask);
 }
 
+
 /* Fallback timer to ensure guests get time updated 'often enough'. */
 static void fallback_timer_fn(unsigned long unused)
 {
     struct task_struct *p = current;
-    unsigned int cpu = p->processor;
 
     if ( !is_idle_task(p) )
         update_dom_time(p->shared_info);
 
-    fallback_timer[cpu].expires = NOW() + MILLISECS(500);
-    add_ac_timer(&fallback_timer[cpu]);
+    fallback_timer[p->processor].expires = NOW() + MILLISECS(500);
+    add_ac_timer(&fallback_timer[p->processor]);
 }
 
-/*
- * Initialise the data structures
- */
+/* Initialise the data structures. */
 void __init scheduler_init(void)
 {
     int i;
@@ -588,20 +645,20 @@ void __init scheduler_init(void)
         init_ac_timer(&schedule_data[i].s_timer);
         schedule_data[i].s_timer.cpu      = i;
         schedule_data[i].s_timer.data     = 2;
-        schedule_data[i].s_timer.function = &sched_timer;
+        schedule_data[i].s_timer.function = &s_timer_fn;
+
+        init_ac_timer(&t_timer[i]);
+        t_timer[i].cpu      = i;
+        t_timer[i].data     = 3;
+        t_timer[i].function = &t_timer_fn;
 
         init_ac_timer(&fallback_timer[i]);
         fallback_timer[i].cpu      = i;
-        fallback_timer[i].data     = 0;
+        fallback_timer[i].data     = 4;
         fallback_timer[i].function = &fallback_timer_fn;
     }
 
     schedule_data[0].idle = &idle0_task;
-
-    init_ac_timer(&v_timer);
-    v_timer.cpu      = 0;
-    v_timer.data     = 0;
-    v_timer.function = &virt_timer;
 }
 
 /*
@@ -612,10 +669,11 @@ void schedulers_start(void)
 {   
     printk("Start schedulers\n");
 
-    virt_timer(0);
+    s_timer_fn(0);
+    smp_call_function((void *)s_timer_fn, NULL, 1, 1);
 
-    sched_timer(0);
-    smp_call_function((void *)sched_timer, NULL, 1, 1);
+    t_timer_fn(0);
+    smp_call_function((void *)t_timer_fn, NULL, 1, 1);
 
     fallback_timer_fn(0);
     smp_call_function((void *)fallback_timer_fn, NULL, 1, 1);
@@ -668,7 +726,7 @@ void dump_runq(u_char key, void *dev_id, struct pt_regs *regs)
     return; 
 }
 
-#ifdef SCHED_HISTO
+#if defined(WAKEUP_HISTO) || defined(BLOCKTIME_HISTO)
 void print_sched_histo(u_char key, void *dev_id, struct pt_regs *regs)
 {
     int loop, i, j;
diff --git a/xen/drivers/block/xen_vbd.c b/xen/drivers/block/xen_vbd.c
index 5570baff94..8a42026dac 100644
--- a/xen/drivers/block/xen_vbd.c
+++ b/xen/drivers/block/xen_vbd.c
@@ -89,7 +89,7 @@ long vbd_create(vbd_create_t *create)
     if ( unlikely((p = find_domain_by_id(create->domain)) == NULL) )
     {
         DPRINTK("vbd_create attempted for non-existent domain %d\n", 
-                domain); 
+                create->domain); 
         return -EINVAL; 
     }
 
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h
index ef29e292cb..ac1ecfd6f8 100644
--- a/xen/include/hypervisor-ifs/hypervisor-if.h
+++ b/xen/include/hypervisor-ifs/hypervisor-if.h
@@ -49,18 +49,19 @@
 #define __HYPERVISOR_net_io_op             6
 #define __HYPERVISOR_fpu_taskswitch        7
 #define __HYPERVISOR_sched_op              8
-#define __HYPERVISOR_dom0_op               9
-#define __HYPERVISOR_network_op           10
-#define __HYPERVISOR_block_io_op          11
-#define __HYPERVISOR_set_debugreg         12
-#define __HYPERVISOR_get_debugreg         13
-#define __HYPERVISOR_update_descriptor    14
-#define __HYPERVISOR_set_fast_trap        15
-#define __HYPERVISOR_dom_mem_op           16
-#define __HYPERVISOR_multicall            17
-#define __HYPERVISOR_kbd_op               18
-#define __HYPERVISOR_update_va_mapping    19
-#define __HYPERVISOR_event_channel_op     20
+#define __HYPERVISOR_set_dom_timer         9
+#define __HYPERVISOR_dom0_op              10
+#define __HYPERVISOR_network_op           11
+#define __HYPERVISOR_block_io_op          12
+#define __HYPERVISOR_set_debugreg         13
+#define __HYPERVISOR_get_debugreg         14
+#define __HYPERVISOR_update_descriptor    15
+#define __HYPERVISOR_set_fast_trap        16
+#define __HYPERVISOR_dom_mem_op           17
+#define __HYPERVISOR_multicall            18
+#define __HYPERVISOR_kbd_op               19
+#define __HYPERVISOR_update_va_mapping    20
+#define __HYPERVISOR_event_channel_op     21
 
 /* And the trap vector is... */
 #define TRAP_INSTR "int $0x82"
@@ -161,9 +162,10 @@
 /*
  * SCHEDOP_* - Scheduler hypercall operations.
  */
-#define SCHEDOP_yield           0
-#define SCHEDOP_exit            1
-#define SCHEDOP_stop            2
+#define SCHEDOP_yield           0   /* Give up the CPU voluntarily.      */
+#define SCHEDOP_block           1   /* Block until an event is received. */
+#define SCHEDOP_exit            3   /* Exit and kill this domain.        */
+#define SCHEDOP_stop            4   /* Stop executing this domain.       */
 
 /*
  * EVTCHNOP_* - Event channel operations.
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h
index 2e9cd0c563..bf1850ca5a 100644
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -100,25 +100,27 @@ struct task_struct
     unsigned int     tot_pages; /* number of pages currently possesed */
     unsigned int     max_pages; /* max number of pages that can be possesed */
 
-    /* scheduling */
+    /* Scheduling. */
     struct list_head run_list;
     int              has_cpu;
-    int state;                  /* current run state */
-    int cpupinned;              /* true if pinned to curent CPU */
-
-    s_time_t lastschd;              /* time this domain was last scheduled */
-    s_time_t cpu_time;              /* total CPU time received till now */
-    s_time_t wokenup;               /* time domain got woken up */
-
+    int              state;         /* current run state */
+    int              cpupinned;     /* true if pinned to current CPU */
+    s_time_t         lastschd;      /* time this domain was last scheduled */
+    s_time_t         lastdeschd;    /* time this domain was last descheduled */
+    s_time_t         cpu_time;      /* total CPU time received till now */
+    s_time_t         wokenup;       /* time domain got woken up */
+    struct ac_timer  timer;         /* one-shot timer for timeout values */
+
+    /* BVT scheduler specific. */
     unsigned long mcu_advance;      /* inverse of weight */
-    u32  avt;                       /* actual virtual time */
-    u32  evt;                       /* effective virtual time */
-    int  warpback;                  /* warp?  */
-    long warp;                      /* virtual time warp */
-    long warpl;                     /* warp limit */
-    long warpu;                     /* unwarp time requirement */
-    s_time_t warped;                /* time it ran warped last time */
-    s_time_t uwarped;               /* time it ran unwarped last time */
+    u32           avt;              /* actual virtual time */
+    u32           evt;              /* effective virtual time */
+    int           warpback;         /* warp?  */
+    long          warp;             /* virtual time warp */
+    long          warpl;            /* warp limit */
+    long          warpu;            /* unwarp time requirement */
+    s_time_t      warped;           /* time it ran warped last time */
+    s_time_t      uwarped;          /* time it ran unwarped last time */
 
     /* Network I/O */
     net_vif_t *net_vif_list[MAX_DOMAIN_VIFS];
@@ -250,7 +252,6 @@ long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp,
 void init_idle_task(void);
 void __wake_up(struct task_struct *p);
 void wake_up(struct task_struct *p);
-long do_yield(void);
 unsigned long __reschedule(struct task_struct *p);
 void reschedule(struct task_struct *p);
 
@@ -271,8 +272,9 @@ static inline long schedule_timeout(long timeout)
     return 0;
 }
 
-#define signal_pending(_p) ((_p)->hyp_events || \
-                            (_p)->shared_info->events)
+#define signal_pending(_p) \
+    ((_p)->hyp_events ||   \
+     ((_p)->shared_info->events & (_p)->shared_info->events_mask))
 
 void domain_init(void);
 
diff --git a/xen/net/dev.c b/xen/net/dev.c
index 936d40f04c..963a65fbfb 100644
--- a/xen/net/dev.c
+++ b/xen/net/dev.c
@@ -1972,6 +1972,16 @@ static int get_tx_bufs(net_vif_t *vif)
         }
         else if ( (target == VIF_PHYS) || IS_PRIV(p) )
         {
+            /*
+             * XXX HACK XXX: Our wildcard rule for domain-0 incorrectly puts 
+             * some 169.254.* (ie. link-local) packets on the wire unless we 
+             * include this explicit test. :-(
+             */
+            if ( (ntohs(*(unsigned short *)(g_data + 12)) == ETH_P_IP) &&
+                 ((ntohl(*(unsigned long *)(g_data + 26)) & 0xFFFF0000) == 
+                  0xA9FE0000) )
+                goto disallow_linklocal_packets;
+
             stx = &vif->tx_shadow_ring[MASK_NET_TX_IDX(j)];
             stx->id     = tx.id;
             stx->size   = tx.size;
@@ -1990,6 +2000,7 @@ static int get_tx_bufs(net_vif_t *vif)
         }
         else
         {
+        disallow_linklocal_packets:
             make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
         }
 
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/config.in b/xenolinux-2.4.24-sparse/arch/xeno/config.in
index 445b574a71..3f4736fd1f 100644
--- a/xenolinux-2.4.24-sparse/arch/xeno/config.in
+++ b/xenolinux-2.4.24-sparse/arch/xeno/config.in
@@ -13,9 +13,11 @@ define_bool CONFIG_SBUS n
 define_bool CONFIG_UID16 y
 
 mainmenu_option next_comment
-comment 'Privileged guest OS'
+comment 'XenoLinux'
 bool 'Support for privileged operations (domain 0)' CONFIG_XENO_PRIV
 endmenu
+# the IBM S/390 patch needs this.
+define_bool CONFIG_NO_IDLE_HZ y
 
 mainmenu_option next_comment
 comment 'Code maturity level options'
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/defconfig b/xenolinux-2.4.24-sparse/arch/xeno/defconfig
index abef573aa7..3ba185a19b 100644
--- a/xenolinux-2.4.24-sparse/arch/xeno/defconfig
+++ b/xenolinux-2.4.24-sparse/arch/xeno/defconfig
@@ -8,9 +8,12 @@ CONFIG_ISA=y
 CONFIG_UID16=y
 
 #
-# Privileged guest OS
+# XenoLinux Options
 #
+# Support for privileged domains
 CONFIG_XENO_PRIV=y
+# On demand timer setting (taken from s390 patch set)
+CONFIG_NO_IDLE_HZ=y
 
 #
 # Code maturity level options
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c
index ac557a3c11..075acdf5af 100644
--- a/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c
+++ b/xenolinux-2.4.24-sparse/arch/xeno/drivers/network/network.c
@@ -81,15 +81,15 @@ static void _dbg_network_int(struct net_device *dev)
     if ( np->state == STATE_CLOSED )
         return;
     
-    printk(KERN_ALERT "tx_full = %d, tx_resp_cons = 0x%08x,"
-           " tx_req_prod = 0x%08x, tx_resp_prod = 0x%08x,"
-           " tx_event = 0x%08x, state=%d\n",
+    printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x,"
+           " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x,"
+           " tx_event=0x%08x, state=%d\n",
            np->tx_full, np->tx_resp_cons, 
            np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, 
            np->net_idx->tx_event,
            test_bit(__LINK_STATE_XOFF, &dev->state));
-    printk(KERN_ALERT "rx_resp_cons = 0x%08x,"
-           " rx_req_prod = 0x%08x, rx_resp_prod = 0x%08x, rx_event = 0x%08x\n",
+    printk(KERN_ALERT "net: rx_resp_cons=0x%08x,"
+           " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n",
            np->rx_resp_cons, np->net_idx->rx_req_prod,
            np->net_idx->rx_resp_prod, np->net_idx->rx_event);
 }
@@ -550,7 +550,8 @@ int __init init_module(void)
         goto fail;
     }
     
-    err = request_irq(_EVENT_DEBUG, dbg_network_int, 0, "debug", NULL);
+    err = request_irq(_EVENT_DEBUG, dbg_network_int, SA_SHIRQ, "net_dbg", 
+                      &dbg_network_int);
     if ( err )
         printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n");
 
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c
index 3b17c7326c..ff64bccd4c 100644
--- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c
+++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/process.c
@@ -80,14 +80,36 @@ void enable_hlt(void)
  */
 void cpu_idle (void)
 {
-    /* endless idle loop with no priority at all */
+    extern int set_timeout_timer(void);
+    
+    /* Endless idle loop with no priority at all. */
     init_idle();
     current->nice = 20;
     current->counter = -100;
 
-    while (1) {
-        while (!current->need_resched)
-            HYPERVISOR_yield();
+    for ( ; ; )
+    {
+        while ( !current->need_resched )
+        {
+            __cli();
+            if ( current->need_resched )
+            {
+                /* The race-free check for events failed. */
+                __sti();
+                break;
+            }
+            else if ( set_timeout_timer() == 0 )
+            {
+                /* NB. Blocking re-enables events in a race-free manner. */
+                HYPERVISOR_block();
+            }
+            else
+            {
+                /* No race here: yielding will get us the CPU again anyway. */
+                __sti();
+                HYPERVISOR_yield();
+            }
+        }
         schedule();
         check_pgt_cache();
     }
diff --git a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c
index 1944e63c1c..bf43b6a99b 100644
--- a/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c
+++ b/xenolinux-2.4.24-sparse/arch/xeno/kernel/time.c
@@ -75,7 +75,7 @@ static u32 st_scale_i; /* convert ticks -> usecs */
 
 /* These are peridically updated in shared_info, and then copied here. */
 static u32 shadow_tsc_stamp;
-static s64 shadow_system_time;
+static u64 shadow_system_time;
 static u32 shadow_time_version;
 static struct timeval shadow_tv;
 
@@ -91,9 +91,12 @@ static long last_update_to_rtc, last_update_to_xen;
 #endif
 
 /* Periodically take synchronised time base from Xen, if we need it. */
-static long last_update_from_xen;
+static long last_update_from_xen;   /* UTC seconds when last read Xen clock. */
 
-static u64 processed_system_time;
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time;   /* System time (ns) at last processing. */
+
+#define NS_PER_TICK (1000000000ULL/HZ)
 
 #define HANDLE_USEC_UNDERFLOW(_tv)         \
     do {                                   \
@@ -197,8 +200,11 @@ static int set_rtc_mmss(unsigned long nowtime)
 #endif
 
 
-/* Must be called with the xtime_lock held for writing. */
-static void get_time_values_from_xen(void)
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called with the xtime_lock held for writing.
+ */
+static void __get_time_values_from_xen(void)
 {
     do {
         shadow_time_version = HYPERVISOR_shared_info->time_version2;
@@ -216,7 +222,11 @@ static void get_time_values_from_xen(void)
     (shadow_time_version == HYPERVISOR_shared_info->time_version2)
 
 
-static inline unsigned long get_time_delta_usecs(void)
+/*
+ * Returns the system time elapsed, in us, since the current shadow_timestamp
+ * was calculated. Must be called with the xtime_lock held for reading.
+ */
+static inline unsigned long __get_time_delta_usecs(void)
 {
     s32      delta_tsc;
     u32      low;
@@ -234,6 +244,9 @@ static inline unsigned long get_time_delta_usecs(void)
 }
 
 
+/*
+ * Returns the current time-of-day in UTC timeval format.
+ */
 void do_gettimeofday(struct timeval *tv)
 {
 	unsigned long flags, lost;
@@ -242,7 +255,7 @@ void do_gettimeofday(struct timeval *tv)
  again:
     read_lock_irqsave(&xtime_lock, flags);
 
-    _tv.tv_usec = get_time_delta_usecs();
+    _tv.tv_usec = __get_time_delta_usecs();
     if ( (lost = (jiffies - wall_jiffies)) != 0 )
         _tv.tv_usec += lost * (1000000 / HZ);
     _tv.tv_sec   = xtime.tv_sec;
@@ -257,7 +270,7 @@ void do_gettimeofday(struct timeval *tv)
          */
         read_unlock_irqrestore(&xtime_lock, flags);
         write_lock_irqsave(&xtime_lock, flags);
-        get_time_values_from_xen();
+        __get_time_values_from_xen();
         write_unlock_irqrestore(&xtime_lock, flags);
         goto again;
     }
@@ -276,6 +289,10 @@ void do_gettimeofday(struct timeval *tv)
     *tv = _tv;
 }
 
+
+/*
+ * Sets the current time-of-day based on passed-in UTC timeval parameter.
+ */
 void do_settimeofday(struct timeval *tv)
 {
     struct timeval newtv;
@@ -291,10 +308,10 @@ void do_settimeofday(struct timeval *tv)
      * be stale, so we can retry with fresh ones.
      */
  again:
-    tv->tv_usec -= get_time_delta_usecs();
+    tv->tv_usec -= __get_time_delta_usecs();
     if ( unlikely(!TIME_VALUES_UP_TO_DATE) )
     {
-        get_time_values_from_xen();
+        __get_time_values_from_xen();
         goto again;
     }
     
@@ -334,6 +351,7 @@ void do_settimeofday(struct timeval *tv)
     }
 }
 
+
 asmlinkage long sys_stime(int *tptr)
 {
 	int value;
@@ -353,14 +371,22 @@ asmlinkage long sys_stime(int *tptr)
 	return 0;
 }
 
-#define NS_PER_TICK (1000000000ULL/HZ)
+
+/* Convert jiffies to system time. Call with xtime_lock held for reading. */
+/* NB. Signed delta: a 'j' before 'jiffies' yields an earlier system time. */
+static inline u64 __jiffies_to_st(unsigned long j)
+{
+    return processed_system_time + ((s64)(long)(j - jiffies) * NS_PER_TICK);
+}
+
 static inline void do_timer_interrupt(int irq, void *dev_id,
                                       struct pt_regs *regs)
 {
     s64 delta;
+    unsigned long ticks = 0;
     long sec_diff;
 
-    get_time_values_from_xen();
+    __get_time_values_from_xen();
 
     if ( (delta = (s64)(shadow_system_time - processed_system_time)) < 0 )
     {
@@ -368,13 +394,24 @@ static inline void do_timer_interrupt(int irq, void *dev_id,
         return;
     }
 
+    /* Process elapsed jiffies since last call. */
     while ( delta >= NS_PER_TICK )
     {
-        do_timer(regs);
+        ticks++;
         delta -= NS_PER_TICK;
         processed_system_time += NS_PER_TICK;
     }
-    
+
+    if ( ticks != 0 )
+    {
+        do_timer_ticks(ticks);
+
+        if ( user_mode(regs) )
+            update_process_times_us(ticks, 0);
+        else
+            update_process_times_us(0, ticks);
+    }
+
     /*
      * Take synchronised time from Xen once a minute if we're not
      * synchronised ourselves, and we haven't chosen to keep an independent
@@ -446,6 +483,7 @@ static inline void do_timer_interrupt(int irq, void *dev_id,
 #endif
 }
 
+
 static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
     write_lock(&xtime_lock);
@@ -463,6 +501,89 @@ static struct irqaction irq_timer = {
     NULL
 };
 
+
+/*
+ * This function works out when the next timer function has to be
+ * executed (by looking at the timer list) and sets the Xen one-shot
+ * domain timer to the appropriate value. This is typically called in
+ * cpu_idle() before the domain blocks.
+ * 
+ * The function returns a non-0 value on error conditions.
+ * 
+ * It must be called with interrupts disabled.
+ */
+extern spinlock_t timerlist_lock;
+int set_timeout_timer(void)
+{
+    struct timer_list *timer;
+    u64 alarm = 0;
+    int ret = 0;
+
+    spin_lock(&timerlist_lock);
+
+    /*
+     * This is safe against long blocking (since calculations are not based on 
+     * TSC deltas). It is also safe against warped system time since
+     * suspend-resume is cooperative and we would first get locked out. It is 
+     * safe against normal updates of jiffies since interrupts are off.
+     */
+    if ( (timer = next_timer_event()) != NULL )
+        alarm = __jiffies_to_st(timer->expires);
+
+    /* Failure is pretty bad, but we'd best soldier on. */
+    if ( HYPERVISOR_set_dom_timer(alarm) != 0 )
+        ret = -1;
+    
+    spin_unlock(&timerlist_lock);
+
+    return ret;
+}
+
+
+/* Time debugging: print shadow time, wallclock, jiffies and next timeout. */
+static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs)
+{
+    unsigned long flags, j, t_exp = 0;
+    u64 s_now, j_st, p_st, t_st;
+    struct timeval s_tv, tv;
+    struct timer_list *timer;
+
+    /* next_timer_event() may return NULL; timeout is then reported as 0. */
+    if ( (timer = next_timer_event()) != NULL )
+        t_exp = timer->expires;
+
+    /* Take one consistent snapshot: __jiffies_to_st() needs xtime_lock. */
+    read_lock_irqsave(&xtime_lock, flags);
+    s_tv.tv_sec  = shadow_tv.tv_sec;
+    s_tv.tv_usec = shadow_tv.tv_usec;
+    s_now        = shadow_system_time;
+    p_st         = processed_system_time;
+    j            = jiffies;
+    j_st         = __jiffies_to_st(j);
+    t_st         = (timer != NULL) ? __jiffies_to_st(t_exp) : 0;
+    read_unlock_irqrestore(&xtime_lock, flags);
+
+    do_gettimeofday(&tv);
+    printk(KERN_ALERT "time: shadow_st=0x%X:%08X\n",
+           (u32)(s_now>>32), (u32)s_now);
+    printk(KERN_ALERT "time: wct=%lds %ldus shadow_wct=%lds %ldus\n",
+           tv.tv_sec, tv.tv_usec, s_tv.tv_sec, s_tv.tv_usec);
+    printk(KERN_ALERT "time: jiffies=%lu(0x%X:%08X) timeout=%lu(0x%X:%08X)\n",
+           j,(u32)(j_st>>32), (u32)j_st,
+           t_exp,(u32)(t_st>>32), (u32)t_st);
+    printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n",
+           (u32)(p_st>>32), (u32)p_st);
+}
+
+static struct irqaction dbg_time = {
+    dbg_time_int, 
+    SA_SHIRQ, 
+    0, 
+    "timer_dbg", 
+    &dbg_time_int,
+    NULL
+};
+
 void __init time_init(void)
 {
     unsigned long long alarm;
@@ -494,10 +615,12 @@ void __init time_init(void)
     st_scale_f = scale & 0xffffffff;
     st_scale_i = scale >> 32;
 
-    get_time_values_from_xen();
+    __get_time_values_from_xen();
     processed_system_time = shadow_system_time;
 
-    setup_irq(TIMER_IRQ, &irq_timer);
+    (void)setup_irq(TIMER_IRQ, &irq_timer);
+
+    (void)setup_irq(_EVENT_DEBUG, &dbg_time);
 
     rdtscll(alarm);
 
diff --git a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h
index 064088ff6f..34272a624f 100644
--- a/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h
+++ b/xenolinux-2.4.24-sparse/include/asm-xeno/hypervisor.h
@@ -256,6 +256,17 @@ static inline int HYPERVISOR_yield(void)
     return ret;
 }
 
+static inline int HYPERVISOR_block(void)
+{
+    int ret;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_block) );
+
+    return ret;
+}
+
 static inline int HYPERVISOR_exit(void)
 {
     int ret;
@@ -279,6 +290,19 @@ static inline int HYPERVISOR_stop(unsigned long srec)
     return ret;
 }
 
+static inline long HYPERVISOR_set_dom_timer(u64 timeout)
+{
+    int ret;
+    unsigned long timeout_hi = (unsigned long)(timeout>>32);
+    unsigned long timeout_lo = (unsigned long)timeout;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret) : "0" (__HYPERVISOR_set_dom_timer),
+        "b" (timeout_hi), "c" (timeout_lo) : "memory" );
+
+    return ret;
+}
+
 static inline int HYPERVISOR_dom0_op(dom0_op_t *dom0_op)
 {
     int ret;
diff --git a/xenolinux-2.4.24-sparse/include/linux/sched.h b/xenolinux-2.4.24-sparse/include/linux/sched.h
new file mode 100644
index 0000000000..ed42340517
--- /dev/null
+++ b/xenolinux-2.4.24-sparse/include/linux/sched.h
@@ -0,0 +1,966 @@
+#ifndef _LINUX_SCHED_H
+#define _LINUX_SCHED_H
+
+#include <asm/param.h>	/* for HZ */
+
+extern unsigned long event;
+
+#include <linux/config.h>
+#include <linux/binfmts.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/times.h>
+#include <linux/timex.h>
+#include <linux/rbtree.h>
+
+#include <asm/system.h>
+#include <asm/semaphore.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/mmu.h>
+
+#include <linux/smp.h>
+#include <linux/tty.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+
+struct exec_domain;
+
+/*
+ * cloning flags:
+ */
+#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+#define CLONE_VM	0x00000100	/* set if VM shared between processes */
+#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_PID	0x00001000	/* set if pid shared */
+#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD	0x00010000	/* Same thread group? */
+#define CLONE_NEWNS	0x00020000	/* New namespace group? */
+
+#define CLONE_SIGNAL	(CLONE_SIGHAND | CLONE_THREAD)
+
+/*
+ * These are the constant used to fake the fixed-point load-average
+ * counting. Some notes:
+ *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+ *    a load-average precision of 10 bits integer + 11 bits fractional
+ *  - if you want to count load-averages more often, you need more
+ *    precision, or rounding will get you. With 2-second counting freq,
+ *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+ *    11 bit fractions.
+ */
+extern unsigned long avenrun[];		/* Load averages */
+
+#define FSHIFT		11		/* nr of bits of precision */
+#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
+#define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
+#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
+#define EXP_5		2014		/* 1/exp(5sec/5min) */
+#define EXP_15		2037		/* 1/exp(5sec/15min) */
+
+#define CALC_LOAD(load,exp,n) \
+	load *= exp; \
+	load += n*(FIXED_1-exp); \
+	load >>= FSHIFT;
+
+#define CT_TO_SECS(x)	((x) / HZ)
+#define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
+
+extern int nr_running, nr_threads;
+extern int last_pid;
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/param.h>
+#include <linux/resource.h>
+#ifdef __KERNEL__
+#include <linux/timer.h>
+#endif
+
+#include <asm/processor.h>
+
+#define TASK_RUNNING		0
+#define TASK_INTERRUPTIBLE	1
+#define TASK_UNINTERRUPTIBLE	2
+#define TASK_ZOMBIE		4
+#define TASK_STOPPED		8
+
+#define __set_task_state(tsk, state_value)		\
+	do { (tsk)->state = (state_value); } while (0)
+#define set_task_state(tsk, state_value)		\
+	set_mb((tsk)->state, (state_value))
+
+#define __set_current_state(state_value)			\
+	do { current->state = (state_value); } while (0)
+#define set_current_state(state_value)		\
+	set_mb(current->state, (state_value))
+
+/*
+ * Scheduling policies
+ */
+#define SCHED_OTHER		0
+#define SCHED_FIFO		1
+#define SCHED_RR		2
+
+/*
+ * This is an additional bit set when we want to
+ * yield the CPU for one re-schedule..
+ */
+#define SCHED_YIELD		0x10
+
+struct sched_param {
+	int sched_priority;
+};
+
+struct completion;
+
+#ifdef __KERNEL__
+
+#include <linux/spinlock.h>
+
+/*
+ * This serializes "schedule()" and also protects
+ * the run-queue from deletions/modifications (but
+ * _adding_ to the beginning of the run-queue has
+ * a separate lock).
+ */
+extern rwlock_t tasklist_lock;
+extern spinlock_t runqueue_lock;
+extern spinlock_t mmlist_lock;
+
+extern void sched_init(void);
+extern void init_idle(void);
+extern void show_state(void);
+extern void cpu_init (void);
+extern void trap_init(void);
+extern void update_process_times(int user);
+#ifdef CONFIG_NO_IDLE_HZ
+extern void update_process_times_us(int user, int system);
+#endif
+extern void update_one_process(struct task_struct *p, unsigned long user,
+			       unsigned long system, int cpu);
+
+#define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+extern signed long FASTCALL(schedule_timeout(signed long timeout));
+asmlinkage void schedule(void);
+
+extern int schedule_task(struct tq_struct *task);
+extern void flush_scheduled_tasks(void);
+extern int start_context_thread(void);
+extern int current_is_keventd(void);
+
+#if CONFIG_SMP
+extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask);
+#else
+# define set_cpus_allowed(p, new_mask) do { } while (0)
+#endif
+
+/*
+ * The default fd array needs to be at least BITS_PER_LONG,
+ * as this is the granularity returned by copy_fdset().
+ */
+#define NR_OPEN_DEFAULT BITS_PER_LONG
+
+struct namespace;
+/*
+ * Open file table structure
+ */
+struct files_struct {
+	atomic_t count;
+	rwlock_t file_lock;	/* Protects all the below members.  Nests inside tsk->alloc_lock */
+	int max_fds;
+	int max_fdset;
+	int next_fd;
+	struct file ** fd;	/* current fd array */
+	fd_set *close_on_exec;
+	fd_set *open_fds;
+	fd_set close_on_exec_init;
+	fd_set open_fds_init;
+	struct file * fd_array[NR_OPEN_DEFAULT];
+};
+
+#define INIT_FILES \
+{ 							\
+	count:		ATOMIC_INIT(1), 		\
+	file_lock:	RW_LOCK_UNLOCKED, 		\
+	max_fds:	NR_OPEN_DEFAULT, 		\
+	max_fdset:	__FD_SETSIZE, 			\
+	next_fd:	0, 				\
+	fd:		&init_files.fd_array[0], 	\
+	close_on_exec:	&init_files.close_on_exec_init, \
+	open_fds:	&init_files.open_fds_init, 	\
+	close_on_exec_init: { { 0, } }, 		\
+	open_fds_init:	{ { 0, } }, 			\
+	fd_array:	{ NULL, } 			\
+}
+
+/* Maximum number of active map areas.. This is a random (large) number */
+#define DEFAULT_MAX_MAP_COUNT	(65536)
+
+extern int max_map_count;
+
+struct mm_struct {
+	struct vm_area_struct * mmap;		/* list of VMAs */
+	rb_root_t mm_rb;
+	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	pgd_t * pgd;
+	atomic_t mm_users;			/* How many users with user space? */
+	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+	int map_count;				/* number of VMAs */
+	struct rw_semaphore mmap_sem;
+	spinlock_t page_table_lock;		/* Protects task page tables and mm->rss */
+
+	struct list_head mmlist;		/* List of all active mm's.  These are globally strung
+						 * together off init_mm.mmlist, and are protected
+						 * by mmlist_lock
+						 */
+
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long start_brk, brk, start_stack;
+	unsigned long arg_start, arg_end, env_start, env_end;
+	unsigned long rss, total_vm, locked_vm;
+	unsigned long def_flags;
+	unsigned long cpu_vm_mask;
+	unsigned long swap_address;
+
+	unsigned dumpable:1;
+
+	/* Architecture-specific MM context */
+	mm_context_t context;
+};
+
+extern int mmlist_nr;
+
+#define INIT_MM(name) \
+{			 				\
+	mm_rb:		RB_ROOT,			\
+	pgd:		swapper_pg_dir, 		\
+	mm_users:	ATOMIC_INIT(2), 		\
+	mm_count:	ATOMIC_INIT(1), 		\
+	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
+	page_table_lock: SPIN_LOCK_UNLOCKED, 		\
+	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
+}
+
+struct signal_struct {
+	atomic_t		count;
+	struct k_sigaction	action[_NSIG];
+	spinlock_t		siglock;
+};
+
+
+#define INIT_SIGNALS {	\
+	count:		ATOMIC_INIT(1), 		\
+	action:		{ {{0,}}, }, 			\
+	siglock:	SPIN_LOCK_UNLOCKED 		\
+}
+
+/*
+ * Some day this will be a full-fledged user tracking system..
+ */
+struct user_struct {
+	atomic_t __count;	/* reference count */
+	atomic_t processes;	/* How many processes does this user have? */
+	atomic_t files;		/* How many open files does this user have? */
+
+	/* Hash table maintenance information */
+	struct user_struct *next, **pprev;
+	uid_t uid;
+};
+
+#define get_current_user() ({ 				\
+	struct user_struct *__user = current->user;	\
+	atomic_inc(&__user->__count);			\
+	__user; })
+
+extern struct user_struct root_user;
+#define INIT_USER (&root_user)
+
+struct task_struct {
+	/*
+	 * offsets of these are hardcoded elsewhere - touch with care
+	 */
+	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
+	unsigned long flags;	/* per process flags, defined below */
+	int sigpending;
+	mm_segment_t addr_limit;	/* thread address space:
+					 	0-0xBFFFFFFF for user-thead
+						0-0xFFFFFFFF for kernel-thread
+					 */
+	struct exec_domain *exec_domain;
+	volatile long need_resched;
+	unsigned long ptrace;
+
+	int lock_depth;		/* Lock depth */
+
+/*
+ * offset 32 begins here on 32-bit platforms. We keep
+ * all fields in a single cacheline that are needed for
+ * the goodness() loop in schedule().
+ */
+	long counter;
+	long nice;
+	unsigned long policy;
+	struct mm_struct *mm;
+	int processor;
+	/*
+	 * cpus_runnable is ~0 if the process is not running on any
+	 * CPU. It's (1 << cpu) if it's running on a CPU. This mask
+	 * is updated under the runqueue lock.
+	 *
+	 * To determine whether a process might run on a CPU, this
+	 * mask is AND-ed with cpus_allowed.
+	 */
+	unsigned long cpus_runnable, cpus_allowed;
+	/*
+	 * (only the 'next' pointer fits into the cacheline, but
+	 * that's just fine.)
+	 */
+	struct list_head run_list;
+	unsigned long sleep_time;
+
+	struct task_struct *next_task, *prev_task;
+	struct mm_struct *active_mm;
+	struct list_head local_pages;
+	unsigned int allocation_order, nr_local_pages;
+
+/* task state */
+	struct linux_binfmt *binfmt;
+	int exit_code, exit_signal;
+	int pdeath_signal;  /*  The signal sent when the parent dies  */
+	/* ??? */
+	unsigned long personality;
+	int did_exec:1;
+	unsigned task_dumpable:1;
+	pid_t pid;
+	pid_t pgrp;
+	pid_t tty_old_pgrp;
+	pid_t session;
+	pid_t tgid;
+	/* boolean value for session group leader */
+	int leader;
+	/* 
+	 * pointers to (original) parent process, youngest child, younger sibling,
+	 * older sibling, respectively.  (p->father can be replaced with 
+	 * p->p_pptr->pid)
+	 */
+	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+	struct list_head thread_group;
+
+	/* PID hash table linkage. */
+	struct task_struct *pidhash_next;
+	struct task_struct **pidhash_pprev;
+
+	wait_queue_head_t wait_chldexit;	/* for wait4() */
+	struct completion *vfork_done;		/* for vfork() */
+	unsigned long rt_priority;
+	unsigned long it_real_value, it_prof_value, it_virt_value;
+	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+	struct timer_list real_timer;
+	struct tms times;
+	unsigned long start_time;
+	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	int swappable:1;
+/* process credentials */
+	uid_t uid,euid,suid,fsuid;
+	gid_t gid,egid,sgid,fsgid;
+	int ngroups;
+	gid_t	groups[NGROUPS];
+	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+	int keep_capabilities:1;
+	struct user_struct *user;
+/* limits */
+	struct rlimit rlim[RLIM_NLIMITS];
+	unsigned short used_math;
+	char comm[16];
+/* file system info */
+	int link_count, total_link_count;
+	struct tty_struct *tty; /* NULL if no tty */
+	unsigned int locks; /* How many file locks are being held */
+/* ipc stuff */
+	struct sem_undo *semundo;
+	struct sem_queue *semsleeping;
+/* CPU-specific state of this task */
+	struct thread_struct thread;
+/* filesystem information */
+	struct fs_struct *fs;
+/* open file information */
+	struct files_struct *files;
+/* namespace */
+	struct namespace *namespace;
+/* signal handlers */
+	spinlock_t sigmask_lock;	/* Protects signal and blocked */
+	struct signal_struct *sig;
+
+	sigset_t blocked;
+	struct sigpending pending;
+
+	unsigned long sas_ss_sp;
+	size_t sas_ss_size;
+	int (*notifier)(void *priv);
+	void *notifier_data;
+	sigset_t *notifier_mask;
+	
+/* Thread group tracking */
+   	u32 parent_exec_id;
+   	u32 self_exec_id;
+/* Protection of (de-)allocation: mm, files, fs, tty */
+	spinlock_t alloc_lock;
+
+/* journalling filesystem info */
+	void *journal_info;
+};
+
+/*
+ * Per process flags
+ */
+#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
+					/* Not implemented yet, only for 486*/
+#define PF_STARTING	0x00000002	/* being created */
+#define PF_EXITING	0x00000004	/* getting shut down */
+#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+#define PF_DUMPCORE	0x00000200	/* dumped core */
+#define PF_SIGNALED	0x00000400	/* killed by a signal */
+#define PF_MEMALLOC	0x00000800	/* Allocating memory */
+#define PF_FREE_PAGES	0x00002000	/* per process page freeing */
+#define PF_NOIO		0x00004000	/* avoid generating further I/O */
+
+#define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
+
+/*
+ * Ptrace flags
+ */
+
+#define PT_PTRACED	0x00000001
+#define PT_TRACESYS	0x00000002
+#define PT_DTRACE	0x00000004	/* delayed trace (used on m68k, i386) */
+#define PT_TRACESYSGOOD	0x00000008
+#define PT_PTRACE_CAP	0x00000010	/* ptracer can follow suid-exec */
+
+#define is_dumpable(tsk)    ((tsk)->task_dumpable && (tsk)->mm && (tsk)->mm->dumpable)
+
+/*
+ * Limit the stack to some sane default: root can always
+ * increase this limit if needed..  8MB seems reasonable.
+ */
+#define _STK_LIM	(8*1024*1024)
+
+#define DEF_COUNTER	(10*HZ/100)	/* 100 ms time slice */
+#define MAX_COUNTER	(20*HZ/100)
+#define DEF_NICE	(0)
+
+extern void yield(void);
+
+/*
+ * The default (Linux) execution domain.
+ */
+extern struct exec_domain	default_exec_domain;
+
+/*
+ *  INIT_TASK is used to set up the first task table, touch at
+ * your own risk!. Base=0, limit=0x1fffff (=2MB)
+ */
+#define INIT_TASK(tsk)	\
+{									\
+    state:		0,						\
+    flags:		0,						\
+    sigpending:		0,						\
+    addr_limit:		KERNEL_DS,					\
+    exec_domain:	&default_exec_domain,				\
+    lock_depth:		-1,						\
+    counter:		DEF_COUNTER,					\
+    nice:		DEF_NICE,					\
+    policy:		SCHED_OTHER,					\
+    mm:			NULL,						\
+    active_mm:		&init_mm,					\
+    cpus_runnable:	~0UL,						\
+    cpus_allowed:	~0UL,						\
+    run_list:		LIST_HEAD_INIT(tsk.run_list),			\
+    next_task:		&tsk,						\
+    prev_task:		&tsk,						\
+    p_opptr:		&tsk,						\
+    p_pptr:		&tsk,						\
+    thread_group:	LIST_HEAD_INIT(tsk.thread_group),		\
+    wait_chldexit:	__WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
+    real_timer:		{						\
+	function:		it_real_fn				\
+    },									\
+    cap_effective:	CAP_INIT_EFF_SET,				\
+    cap_inheritable:	CAP_INIT_INH_SET,				\
+    cap_permitted:	CAP_FULL_SET,					\
+    keep_capabilities:	0,						\
+    rlim:		INIT_RLIMITS,					\
+    user:		INIT_USER,					\
+    comm:		"swapper",					\
+    thread:		INIT_THREAD,					\
+    fs:			&init_fs,					\
+    files:		&init_files,					\
+    sigmask_lock:	SPIN_LOCK_UNLOCKED,				\
+    sig:		&init_signals,					\
+    pending:		{ NULL, &tsk.pending.head, {{0}}},		\
+    blocked:		{{0}},						\
+    alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+    journal_info:	NULL,						\
+}
+
+
+#ifndef INIT_TASK_SIZE
+# define INIT_TASK_SIZE	(2048*sizeof(long))	/* size in bytes; parenthesized so it is safe inside larger expressions */
+#endif
+
+union task_union {
+	struct task_struct task;	/* shares storage with stack[] (both start at the union's base) */
+	unsigned long stack[INIT_TASK_SIZE/sizeof(long)];	/* INIT_TASK_SIZE bytes in total */
+};
+
+extern union task_union init_task_union;
+
+extern struct   mm_struct init_mm;
+extern struct task_struct *init_tasks[NR_CPUS];
+
+/* PID hashing. (shouldn't this be dynamic?) */
+#define PIDHASH_SZ (4096 >> 2)
+extern struct task_struct *pidhash[PIDHASH_SZ];
+
+#define pid_hashfn(x)	((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+
+static inline void hash_pid(struct task_struct *p)
+{
+	struct task_struct **bucket = &pidhash[pid_hashfn(p->pid)], *old_head = *bucket;
+	p->pidhash_next = old_head;	/* p becomes the new head of its bucket chain */
+	if (old_head != NULL)
+		old_head->pidhash_pprev = &p->pidhash_next;
+	*bucket = p;
+	p->pidhash_pprev = bucket;
+}
+
+static inline void unhash_pid(struct task_struct *p)
+{
+	if(p->pidhash_next)	/* a successor exists: make its back-pointer skip p */
+		p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+	*p->pidhash_pprev = p->pidhash_next;	/* whatever pointed at p now points at p's successor (or NULL) */
+}
+
+static inline struct task_struct *find_task_by_pid(int pid)
+{
+	struct task_struct *p = pidhash[pid_hashfn(pid)];
+
+	/* Walk the bucket chain until the pid matches or the chain ends. */
+	while (p != NULL && p->pid != pid)
+		p = p->pidhash_next;
+	return p;
+}
+
+#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL)
+
+static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu)
+{
+	tsk->processor = cpu;
+	tsk->cpus_runnable = 1UL << cpu;	/* exactly one bit set, so task_has_cpu(tsk) is now true */
+}
+
+static inline void task_release_cpu(struct task_struct *tsk)
+{
+	tsk->cpus_runnable = ~0UL;	/* all-ones is the "not on any CPU" value that task_has_cpu() tests for */
+}
+
+/* per-UID process charging. */
+extern struct user_struct * alloc_uid(uid_t);
+extern void free_uid(struct user_struct *);
+extern void switch_uid(struct user_struct *);
+
+#include <asm/current.h>
+
+extern unsigned long volatile jiffies;
+extern unsigned long itimer_ticks;
+extern unsigned long itimer_next;
+extern struct timeval xtime;
+extern void do_timer(struct pt_regs *);
+#ifdef CONFIG_NO_IDLE_HZ
+extern void do_timer_ticks(int ticks);
+#endif
+
+extern unsigned int * prof_buffer;
+extern unsigned long prof_len;
+extern unsigned long prof_shift;
+
+#define CURRENT_TIME (xtime.tv_sec)
+
+extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
+extern void FASTCALL(sleep_on(wait_queue_head_t *q));
+extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
+				      signed long timeout));
+extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+						    signed long timeout));
+extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+
+#define wake_up(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+#define wake_up_nr(x, nr)		__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+#define wake_up_all(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
+#define wake_up_sync(x)			__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+#define wake_up_sync_nr(x, nr)		__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+#define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
+#define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
+#define wake_up_interruptible_sync(x)	__wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)
+asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
+
+extern int in_group_p(gid_t);
+extern int in_egroup_p(gid_t);
+
+extern void proc_caches_init(void);
+extern void flush_signals(struct task_struct *);
+extern void flush_signal_handlers(struct task_struct *);
+extern void sig_exit(int, int, struct siginfo *);
+extern int dequeue_signal(sigset_t *, siginfo_t *);
+extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+			      sigset_t *mask);
+extern void unblock_all_signals(void);
+extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+extern int kill_pg_info(int, struct siginfo *, pid_t);
+extern int kill_sl_info(int, struct siginfo *, pid_t);
+extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern void notify_parent(struct task_struct *, int);
+extern void do_notify_parent(struct task_struct *, int);
+extern void force_sig(int, struct task_struct *);
+extern int send_sig(int, struct task_struct *, int);
+extern int kill_pg(pid_t, int, int);
+extern int kill_sl(pid_t, int, int);
+extern int kill_proc(pid_t, int, int);
+extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+
+static inline int signal_pending(struct task_struct *p)
+{
+	return p->sigpending ? 1 : 0;	/* normalise the flag to 0/1 */
+}
+
+/*
+ * Re-calculate pending state from the set of locally pending
+ * signals, globally pending signals, and blocked signals.
+ */
+static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+{
+	unsigned long ready;	/* OR of every signal word with its blocked bits masked out */
+	long i;
+
+	switch (_NSIG_WORDS) {	/* cases 4, 2 and 1 are unrolled forms of the default loop */
+	default:
+		for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+			ready |= signal->sig[i] &~ blocked->sig[i];
+		break;
+
+	case 4: ready  = signal->sig[3] &~ blocked->sig[3];
+		ready |= signal->sig[2] &~ blocked->sig[2];
+		ready |= signal->sig[1] &~ blocked->sig[1];
+		ready |= signal->sig[0] &~ blocked->sig[0];
+		break;
+
+	case 2: ready  = signal->sig[1] &~ blocked->sig[1];
+		ready |= signal->sig[0] &~ blocked->sig[0];
+		break;
+
+	case 1: ready  = signal->sig[0] &~ blocked->sig[0];
+	}
+	return ready !=	0;	/* 1 if any signal is set and not blocked, else 0 */
+}
+
+/* Reevaluate whether the task has signals pending delivery.
+   This is required every time the blocked sigset_t changes.
+   All callers should have t->sigmask_lock.  */
+
+static inline void recalc_sigpending(struct task_struct *t)
+{
+	t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);	/* stores 0 or 1 */
+}
+
+/* True if we are on the alternate signal stack.  */
+
+static inline int on_sig_stack(unsigned long sp)
+{
+	return (sp - current->sas_ss_sp < current->sas_ss_size);	/* unsigned subtract: sp below sas_ss_sp wraps to a huge value, so one compare checks both bounds */
+}
+
+static inline int sas_ss_flags(unsigned long sp)
+{
+	return (current->sas_ss_size == 0 ? SS_DISABLE	/* zero size: reported as disabled, sp is not examined */
+		: on_sig_stack(sp) ? SS_ONSTACK : 0);
+}
+
+extern int request_irq(unsigned int,
+		       void (*handler)(int, void *, struct pt_regs *),
+		       unsigned long, const char *, void *);
+extern void free_irq(unsigned int, void *);
+
+/*
+ * This has now become a routine instead of a macro, it sets a flag if
+ * it returns true (to do BSD-style accounting where the process is flagged
+ * if it uses root privs). The implication of this is that you should do
+ * normal permissions checks first, and check suser() last.
+ *
+ * [Dec 1997 -- Chris Evans]
+ * For correctness, the above considerations need to be extended to
+ * fsuser(). This is done, along with moving fsuser() checks to be
+ * last.
+ *
+ * These will be removed, but in the mean time, when the SECURE_NOROOT 
+ * flag is set, uids don't grant privilege.
+ */
+static inline int suser(void)
+{
+	if (issecure(SECURE_NOROOT) || current->euid != 0)
+		return 0;
+	/* Privilege granted: record that it was used. */
+	current->flags |= PF_SUPERPRIV;
+	return 1;
+}
+
+static inline int fsuser(void)
+{
+	if (issecure(SECURE_NOROOT) || current->fsuid != 0)
+		return 0;
+	/* Privilege granted: record that it was used. */
+	current->flags |= PF_SUPERPRIV;
+	return 1;
+}
+
+/*
+ * capable() checks for a particular capability.  
+ * New privilege checks should use this interface, rather than suser() or
+ * fsuser(). See include/linux/capability.h for defined capabilities.
+ */
+
+static inline int capable(int cap)
+{
+#if 1 /* ok now */
+	if (cap_raised(current->cap_effective, cap))
+#else
+	if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)	/* compiled out by the #if 1 above */
+#endif
+	{
+		current->flags |= PF_SUPERPRIV;	/* side effect: a successful check marks the task */
+		return 1;
+	}
+	return 0;	/* capability not raised: flags are left untouched */
+}
+
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+extern struct mm_struct * start_lazy_tlb(void);
+extern void end_lazy_tlb(struct mm_struct *mm);
+
+/* mmdrop drops the mm and the page tables */
+extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+static inline void mmdrop(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))	/* presumably true only when mm_count drops to zero — confirm against atomic.h */
+		__mmdrop(mm);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+/* Remove the current tasks stale references to the old mm_struct */
+extern void mm_release(void);
+
+/*
+ * Routines for handling the fd arrays
+ */
+extern struct file ** alloc_fd_array(int);
+extern int expand_fd_array(struct files_struct *, int nr);
+extern void free_fd_array(struct file **, int);
+
+extern fd_set *alloc_fdset(int);
+extern int expand_fdset(struct files_struct *, int nr);
+extern void free_fdset(fd_set *, int);
+
+extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern void flush_thread(void);
+extern void exit_thread(void);
+
+extern void exit_mm(struct task_struct *);
+extern void exit_files(struct task_struct *);
+extern void exit_sighand(struct task_struct *);
+
+extern void reparent_to_init(void);
+extern void daemonize(void);
+
+extern int do_execve(char *, char **, char **, struct pt_regs *);
+extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+
+extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+
+extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
+
+#define __wait_event(wq, condition) 					\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&(wq), &__wait);	/* (wq) parenthesized: any lvalue expression works */ \
+	for (;;) {							\
+		set_current_state(TASK_UNINTERRUPTIBLE); /* state is set before condition is re-tested */ \
+		if (condition)						\
+			break;						\
+		schedule();						\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&(wq), &__wait);				\
+} while (0)
+
+#define wait_event(wq, condition) 					\
+do {									\
+	if (condition)	 	/* already true: skip the wait queue entirely */ \
+		break;							\
+	__wait_event(wq, condition);					\
+} while (0)
+
+#define __wait_event_interruptible(wq, condition, ret)			\
+do {									\
+	wait_queue_t __wait;						\
+	init_waitqueue_entry(&__wait, current);				\
+									\
+	add_wait_queue(&(wq), &__wait);	/* (wq) parenthesized: any lvalue expression works */ \
+	for (;;) {							\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (condition)	/* ret is left untouched on this path */ \
+			break;						\
+		if (!signal_pending(current)) {				\
+			schedule();					\
+			continue;					\
+		}							\
+		ret = -ERESTARTSYS;	/* a signal is pending: give up waiting */ \
+		break;							\
+	}								\
+	current->state = TASK_RUNNING;					\
+	remove_wait_queue(&(wq), &__wait);				\
+} while (0)
+	
+#define wait_event_interruptible(wq, condition)				\
+({									\
+	int __ret = 0;	/* stays 0 unless the wait is cut short by a signal */ \
+	if (!(condition))						\
+		__wait_event_interruptible(wq, condition, __ret);	\
+	__ret;		/* value of the statement expression: 0 or -ERESTARTSYS */ \
+})
+
+#define REMOVE_LINKS(p) do { \
+	(p)->next_task->prev_task = (p)->prev_task; /* unlink p from the next_task/prev_task ring */ \
+	(p)->prev_task->next_task = (p)->next_task; \
+	if ((p)->p_osptr) /* older sibling exists: bypass p in the sibling chain */ \
+		(p)->p_osptr->p_ysptr = (p)->p_ysptr; \
+	if ((p)->p_ysptr) \
+		(p)->p_ysptr->p_osptr = (p)->p_osptr; \
+	else /* no younger sibling: the parent's p_cptr moves to p's older sibling */ \
+		(p)->p_pptr->p_cptr = (p)->p_osptr; \
+	} while (0)
+
+#define SET_LINKS(p) do { \
+	(p)->next_task = &init_task; /* insert p just before init_task in the task ring */ \
+	(p)->prev_task = init_task.prev_task; \
+	init_task.prev_task->next_task = (p); \
+	init_task.prev_task = (p); \
+	(p)->p_ysptr = NULL; /* p has no younger sibling: it becomes the parent's p_cptr below */ \
+	if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+		(p)->p_osptr->p_ysptr = (p); \
+	(p)->p_pptr->p_cptr = (p); \
+	} while (0)
+
+#define for_each_task(p) /* visits every task in the ring except init_task itself */ \
+	for ((p) = &init_task ; ((p) = (p)->next_task) != &init_task ; )
+
+#define for_each_thread(task) /* walks current's thread_group list; current itself is skipped */ \
+	for ((task) = next_thread(current) ; (task) != current ; (task) = next_thread(task))
+
+#define next_thread(p) \
+	list_entry((p)->thread_group.next, struct task_struct, thread_group)
+
+#define thread_group_leader(p)	((p)->pid == (p)->tgid)	/* argument parenthesized so expression arguments expand correctly */
+
+static inline void del_from_runqueue(struct task_struct * p)
+{
+	nr_running--;
+	p->sleep_time = jiffies;	/* records the tick at which p left the runqueue */
+	list_del(&p->run_list);
+	p->run_list.next = NULL;	/* NULL next is the "not on runqueue" marker read by task_on_runqueue() */
+}
+
+static inline int task_on_runqueue(struct task_struct *p)
+{
+	return (p->run_list.next != NULL);	/* del_from_runqueue() sets next to NULL on removal */
+}
+
+static inline void unhash_process(struct task_struct *p)
+{
+	if (task_on_runqueue(p))	/* still on the runqueue: reported as a bug, then execution continues here */
+		out_of_line_bug();
+	write_lock_irq(&tasklist_lock);	/* all unlinking below happens under tasklist_lock */
+	nr_threads--;
+	unhash_pid(p);
+	REMOVE_LINKS(p);
+	list_del(&p->thread_group);
+	write_unlock_irq(&tasklist_lock);
+}
+
+/* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
+static inline void task_lock(struct task_struct *p)
+{
+	spin_lock(&p->alloc_lock);
+}
+
+static inline void task_unlock(struct task_struct *p)
+{
+	spin_unlock(&p->alloc_lock);
+}
+
+/* write full pathname into buffer and return start of pathname */
+static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+				char *buf, int buflen)
+{
+	char *res;
+	struct vfsmount *rootmnt;
+	struct dentry *root;
+	read_lock(&current->fs->lock);	/* current's root mount and dentry are read under fs->lock */
+	rootmnt = mntget(current->fs->rootmnt);	/* presumably takes a reference — released by mntput() below */
+	root = dget(current->fs->root);	/* presumably takes a reference — released by dput() below */
+	read_unlock(&current->fs->lock);
+	spin_lock(&dcache_lock);	/* __d_path() is called with dcache_lock held */
+	res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+	spin_unlock(&dcache_lock);
+	dput(root);
+	mntput(rootmnt);
+	return res;	/* whatever __d_path() returned */
+}
+
+static inline int need_resched(void)
+{
+	return (unlikely(current->need_resched));
+}
+
+extern void __cond_resched(void);
+static inline void cond_resched(void)
+{
+	if (need_resched())
+		__cond_resched();
+}
+
+#endif /* __KERNEL__ */
+#endif
diff --git a/xenolinux-2.4.24-sparse/include/linux/timer.h b/xenolinux-2.4.24-sparse/include/linux/timer.h
new file mode 100644
index 0000000000..238083218f
--- /dev/null
+++ b/xenolinux-2.4.24-sparse/include/linux/timer.h
@@ -0,0 +1,77 @@
+#ifndef _LINUX_TIMER_H
+#define _LINUX_TIMER_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+
+/*
+ * In Linux 2.4, static timers have been removed from the kernel.
+ * Timers may be dynamically created and destroyed, and should be initialized
+ * by a call to init_timer() upon creation.
+ *
+ * The "data" field enables use of a common timeout function for several
+ * timeouts. You can use this field to distinguish between the different
+ * invocations.
+ */
+struct timer_list {
+	struct list_head list;	/* list linkage; list.next == NULL means not pending (see timer_pending()) */
+	unsigned long expires;
+	unsigned long data;	/* the "data" field described in the comment above */
+	void (*function)(unsigned long);
+};
+
+extern void add_timer(struct timer_list * timer);
+extern int del_timer(struct timer_list * timer);
+#ifdef CONFIG_NO_IDLE_HZ
+extern struct timer_list *next_timer_event(void);
+#endif
+
+#ifdef CONFIG_SMP
+extern int del_timer_sync(struct timer_list * timer);
+extern void sync_timers(void);
+#else
+#define del_timer_sync(t)	del_timer(t)
+#define sync_timers()		do { } while (0)
+#endif
+
+/*
+ * mod_timer is a more efficient way to update the expire field of an
+ * active timer (if the timer is inactive it will be activated)
+ * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a).
+ * If the timer is known to be not pending (ie, in the handler), mod_timer
+ * is less efficient than a->expires = b; add_timer(a).
+ */
+int mod_timer(struct timer_list *timer, unsigned long expires);
+
+extern void it_real_fn(unsigned long);
+
+static inline void init_timer(struct timer_list * timer)
+{
+	timer->list.next = timer->list.prev = NULL;	/* only the list links are reset; expires, data and function are not touched */
+}
+
+static inline int timer_pending (const struct timer_list * timer)
+{
+	return timer->list.next != NULL;	/* 1 while the timer is linked on a list, 0 after init_timer() */
+}
+
+/*
+ *	These inlines deal with timer wrapping correctly. You are 
+ *	strongly encouraged to use them
+ *	1. Because people otherwise forget
+ *	2. Because if the timer wrap changes in future you won't have to
+ *	   alter your driver code.
+ *
+ * time_after(a,b) returns true if the time a is after time b.
+ *
+ * Do this with "<0" and ">=0" to only test the sign of the result. A
+ * good compiler would generate better code (and a really good compiler
+ * wouldn't care). Gcc is currently neither.
+ */
+#define time_after(a,b)		((long)((unsigned long)(b) - (unsigned long)(a)) < 0)	/* subtract as unsigned (wraps, never overflows), then test the sign */
+#define time_before(a,b)	time_after(b,a)
+
+#define time_after_eq(a,b)	((long)((unsigned long)(a) - (unsigned long)(b)) >= 0)	/* same scheme as time_after */
+#define time_before_eq(a,b)	time_after_eq(b,a)
+
+#endif
diff --git a/xenolinux-2.4.24-sparse/kernel/panic.c b/xenolinux-2.4.24-sparse/kernel/panic.c
index 871ea67fee..6ab619a607 100644
--- a/xenolinux-2.4.24-sparse/kernel/panic.c
+++ b/xenolinux-2.4.24-sparse/kernel/panic.c
@@ -110,7 +110,8 @@ NORET_TYPE void panic(const char * fmt, ...)
 #endif
 		CHECK_EMERGENCY_SYNC
 #if defined(CONFIG_XENO)
-                HYPERVISOR_exit();
+		HYPERVISOR_console_write(buf, strlen(buf));
+		HYPERVISOR_exit();
 #endif
 	}
 }
diff --git a/xenolinux-2.4.24-sparse/kernel/timer.c b/xenolinux-2.4.24-sparse/kernel/timer.c
new file mode 100644
index 0000000000..567794ab26
--- /dev/null
+++ b/xenolinux-2.4.24-sparse/kernel/timer.c
@@ -0,0 +1,968 @@
+/*
+ *  linux/kernel/timer.c
+ *
+ *  Kernel internal timers, kernel timekeeping, basic process system calls
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
+ *
+ *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *              "A Kernel Model for Precision Timekeeping" by Dave Mills
+ *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *              serialize accesses to xtime/lost_ticks).
+ *                              Copyright (C) 1998  Andrea Arcangeli
+ *  1999-03-10  Improved NTP compatibility by Ulrich Windl
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/timex.h>
+#include <linux/delay.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * Timekeeping variables
+ */
+
+long tick = (1000000 + HZ/2) / HZ;	/* timer interrupt period */
+
+/* The current time */
+struct timeval xtime __attribute__ ((aligned (16)));
+
+/* Don't completely fail for HZ > 500.  */
+int tickadj = 500/HZ ? : 1;		/* microsecs */
+
+DECLARE_TASK_QUEUE(tq_timer);
+DECLARE_TASK_QUEUE(tq_immediate);
+
+/*
+ * phase-lock loop variables
+ */
+/* TIME_ERROR prevents overwriting the CMOS clock */
+int time_state = TIME_OK;		/* clock synchronization status	*/
+int time_status = STA_UNSYNC;		/* clock status bits		*/
+long time_offset;			/* time adjustment (us)		*/
+long time_constant = 2;			/* pll time constant		*/
+long time_tolerance = MAXFREQ;		/* frequency tolerance (ppm)	*/
+long time_precision = 1;		/* clock precision (us)		*/
+long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
+long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
+long time_phase;			/* phase offset (scaled us)	*/
+long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
+					/* frequency offset (scaled ppm)*/
+long time_adj;				/* tick adjust (scaled 1 / HZ)	*/
+long time_reftime;			/* time at last adjustment (s)	*/
+
+long time_adjust;
+long time_adjust_step;
+
+unsigned long event;
+
+extern int do_setitimer(int, struct itimerval *, struct itimerval *);
+
+unsigned long volatile jiffies;
+
+unsigned int * prof_buffer;
+unsigned long prof_len;
+unsigned long prof_shift;
+
+/*
+ * Event timer code
+ */
+#define TVN_BITS 6
+#define TVR_BITS 8
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+struct timer_vec {
+	int index;
+	struct list_head vec[TVN_SIZE];
+};
+
+struct timer_vec_root {
+	int index;
+	struct list_head vec[TVR_SIZE];
+};
+
+static struct timer_vec tv5;
+static struct timer_vec tv4;
+static struct timer_vec tv3;
+static struct timer_vec tv2;
+static struct timer_vec_root tv1;
+
+static struct timer_vec * const tvecs[] = {
+	(struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
+};
+
+static struct list_head * run_timer_list_running;
+
+#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
+
+void init_timervecs (void)
+{
+	int slot;
+
+	for (slot = 0; slot < TVN_SIZE; slot++) {
+		INIT_LIST_HEAD(&tv5.vec[slot]);
+		INIT_LIST_HEAD(&tv4.vec[slot]);
+		INIT_LIST_HEAD(&tv3.vec[slot]);
+		INIT_LIST_HEAD(&tv2.vec[slot]);
+	}
+	for (slot = 0; slot < TVR_SIZE; slot++)	/* the root vector has TVR_SIZE slots */
+		INIT_LIST_HEAD(&tv1.vec[slot]);
+}
+
+static unsigned long timer_jiffies;
+
+static inline void internal_add_timer(struct timer_list *timer)
+{
+	/*
+	 * must be cli-ed when calling this
+	 */
+	unsigned long expires = timer->expires;
+	unsigned long idx = expires - timer_jiffies;	/* unsigned distance from timer_jiffies; wraps to a huge value when expires is behind it */
+	struct list_head * vec;
+
+	if (run_timer_list_running)	/* non-NULL while run_timer_list() runs handlers: park the timer on that list */
+		vec = run_timer_list_running;
+	else if (idx < TVR_SIZE) {
+		int i = expires & TVR_MASK;
+		vec = tv1.vec + i;
+	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
+		int i = (expires >> TVR_BITS) & TVN_MASK;
+		vec = tv2.vec + i;
+	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
+		vec =  tv3.vec + i;
+	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
+		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
+		vec = tv4.vec + i;
+	} else if ((signed long) idx < 0) {
+		/* can happen if you add a timer with expires == jiffies,
+		 * or you set a timer to go off in the past
+		 */
+		vec = tv1.vec + tv1.index;	/* queue on the tv1 slot that is processed next */
+	} else if (idx <= 0xffffffffUL) {
+		int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
+		vec = tv5.vec + i;
+	} else {
+		/* Can only get here on architectures with 64-bit jiffies */
+		INIT_LIST_HEAD(&timer->list);	/* self-linked and not placed on any vector */
+		return;
+	}
+	/*
+	 * Timers are FIFO!
+	 */
+	list_add(&timer->list, vec->prev);	/* adding after the current last entry appends at the tail */
+}
+
+/* Initialize both explicitly - let's try to have them in the same cache line */
+spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
+
+#ifdef CONFIG_SMP
+volatile struct timer_list * volatile running_timer;	/* timer whose handler is currently executing, or NULL */
+#define timer_enter(t) do { running_timer = (t); mb(); } while (0)
+#define timer_exit() do { running_timer = NULL; } while (0)
+#define timer_is_running(t) (running_timer == (t))
+#define timer_synchronize(t) while (timer_is_running(t)) barrier()	/* busy-waits until running_timer no longer equals t */
+#else
+#define timer_enter(t)		do { } while (0)	/* no-ops when CONFIG_SMP is not defined */
+#define timer_exit()		do { } while (0)
+#endif
+
+void add_timer(struct timer_list *timer)
+{
+	unsigned long flags;
+	int already_queued;
+
+	spin_lock_irqsave(&timerlist_lock, flags);
+	already_queued = timer_pending(timer);
+	if (!already_queued)
+		internal_add_timer(timer);
+	spin_unlock_irqrestore(&timerlist_lock, flags);
+
+	if (already_queued)	/* report the double add after the lock is released */
+		printk("bug: kernel timer added twice at %p.\n",
+				__builtin_return_address(0));
+}
+
+static inline int detach_timer (struct timer_list *timer)
+{
+	int was_pending = timer_pending(timer);	/* 0 or 1 */
+	if (was_pending)
+		list_del(&timer->list);
+	return was_pending;
+}
+
+int mod_timer(struct timer_list *timer, unsigned long expires)
+{
+	unsigned long flags;
+	int was_pending;
+
+	spin_lock_irqsave(&timerlist_lock, flags);
+	timer->expires = expires;
+	was_pending = detach_timer(timer);	/* 1 if the timer had been queued, else 0 */
+	internal_add_timer(timer);
+	spin_unlock_irqrestore(&timerlist_lock, flags);
+	return was_pending;
+}
+
+int del_timer(struct timer_list * timer)
+{
+	unsigned long flags;
+	int was_pending;
+
+	spin_lock_irqsave(&timerlist_lock, flags);
+	was_pending = detach_timer(timer);
+	timer->list.next = timer->list.prev = NULL;	/* NULL links are the "not pending" marker */
+	spin_unlock_irqrestore(&timerlist_lock, flags);
+	return was_pending;
+}
+
+#ifdef CONFIG_SMP
+void sync_timers(void)
+{
+	spin_unlock_wait(&global_bh_lock);
+}
+
+/*
+ * SMP specific function to delete periodic timer.
+ * The caller must prevent, by some means, the timer from being
+ * restarted. Upon exit the timer is not queued and its handler is not
+ * running on any CPU. Returns the number of times the timer was deleted
+ * (for reference counting).
+ */
+
+int del_timer_sync(struct timer_list * timer)
+{
+	int ret = 0;	/* accumulates how many times the timer was found queued */
+
+	for (;;) {
+		unsigned long flags;
+		int running;
+
+		spin_lock_irqsave(&timerlist_lock, flags);
+		ret += detach_timer(timer);
+		timer->list.next = timer->list.prev = NULL;	/* NULL, matching del_timer() and run_timer_list() */
+		running = timer_is_running(timer);	/* sampled under the lock */
+		spin_unlock_irqrestore(&timerlist_lock, flags);
+
+		if (!running)
+			break;
+
+		timer_synchronize(timer);	/* spin until the handler finishes, then loop to detach again */
+	}
+
+	return ret;
+}
+#endif
+
+
+static inline void cascade_timers(struct timer_vec *tv)
+{
+	/* re-insert every timer from tv's current slot via internal_add_timer() */
+	struct list_head *head, *curr, *next;
+
+	head = tv->vec + tv->index;
+	curr = head->next;
+	/*
+	 * We are removing _all_ timers from the list, so we don't  have to
+	 * detach them individually, just clear the list afterwards.
+	 */
+	while (curr != head) {
+		struct timer_list *tmp;
+
+		tmp = list_entry(curr, struct timer_list, list);
+		next = curr->next;	/* saved first: re-adding the timer rewrites its links */
+		list_del(curr); /* original author's note: not needed, head is re-initialised below */
+		internal_add_timer(tmp);
+		curr = next;
+	}
+	INIT_LIST_HEAD(head);
+	tv->index = (tv->index + 1) & TVN_MASK;	/* advance to the next slot, wrapping at TVN_SIZE */
+}
+
+static inline void run_timer_list(void)
+{
+	spin_lock_irq(&timerlist_lock);
+	while ((long)(jiffies - timer_jiffies) >= 0) {	/* one iteration per tick until timer_jiffies has passed jiffies */
+		LIST_HEAD(queued);	/* collects timers added while handlers run (see internal_add_timer()) */
+		struct list_head *head, *curr;
+		if (!tv1.index) {	/* tv1 has wrapped: refill it from the higher vectors */
+			int n = 1;
+			do {
+				cascade_timers(tvecs[n]);
+			} while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);	/* keep going while the level just cascaded wrapped too */
+		}
+		run_timer_list_running = &queued;
+repeat:
+		head = tv1.vec + tv1.index;
+		curr = head->next;
+		if (curr != head) {
+			struct timer_list *timer;
+			void (*fn)(unsigned long);
+			unsigned long data;
+
+			timer = list_entry(curr, struct timer_list, list);
+ 			fn = timer->function;	/* copied out before the lock is dropped */
+ 			data= timer->data;
+
+			detach_timer(timer);
+			timer->list.next = timer->list.prev = NULL;	/* timer reads as not pending while its handler runs */
+			timer_enter(timer);
+			spin_unlock_irq(&timerlist_lock);	/* the handler is called without timerlist_lock held */
+			fn(data);
+			spin_lock_irq(&timerlist_lock);
+			timer_exit();
+			goto repeat;	/* re-read the slot head: the list may have changed while unlocked */
+		}
+		run_timer_list_running = NULL;
+		++timer_jiffies; 
+		tv1.index = (tv1.index + 1) & TVR_MASK;
+
+		curr = queued.next;
+		while (curr != &queued) {	/* file the parked timers into their real slots */
+			struct timer_list *timer;
+
+			timer = list_entry(curr, struct timer_list, list);
+			curr = curr->next;	/* advance first: internal_add_timer() relinks timer->list */
+			internal_add_timer(timer);
+		}			
+	}
+	spin_unlock_irq(&timerlist_lock);
+}
+
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find the timer that is due to expire next, or NULL if none is pending.
+ * This is used on S/390 to stop all activity when all cpus are idle,
+ * and in XenoLinux to achieve the same.
+ * The timerlist_lock must be acquired before calling this function.
+ */
+struct timer_list *next_timer_event(void)
+{
+	struct timer_list *best, *cand;
+	struct list_head *pos;
+	int level, slot;
+
+	/* Scan tv1 once around, starting at its current slot. */
+	level = 0;
+	slot = tvecs[0]->index;
+	do {
+		struct list_head *head = tvecs[0]->vec + slot;
+		if (!list_empty(head)) {
+			best = list_entry(head->next, struct timer_list, list);
+			goto found;
+		}
+		slot = (slot + 1) & TVR_MASK;
+	} while (slot != tv1.index);
+
+	/* Nothing in tv1: try the remaining vectors in increasing order. */
+	for (level = 1; level < NOOF_TVECS; level++) {
+		slot = tvecs[level]->index;
+		do {
+			/* Pick the earliest expiry within this slot. */
+			best = NULL;
+			list_for_each(pos, tvecs[level]->vec + slot) {
+				cand = list_entry(pos, struct timer_list, list);
+				if (!best || time_before(cand->expires, best->expires))
+					best = cand;
+			}
+			if (best)
+				goto found;
+			slot = (slot + 1) & TVN_MASK;
+		} while (slot != tvecs[level]->index);
+	}
+	return NULL;
+found:
+	/* 'best' was found in tvecs[level]->vec[slot]. */
+	if (level < NOOF_TVECS-1 && slot < tvecs[level]->index) {
+		/*
+		 * The search wrapped. We need to look at the next list
+		 * from tvecs[level+1] that would cascade into tvecs[level].
+		 */
+		list_for_each(pos, tvecs[level+1]->vec + tvecs[level+1]->index) {
+			cand = list_entry(pos, struct timer_list, list);
+			if (time_before(cand->expires, best->expires))
+				best = cand;
+		}
+	}
+	return best;
+}
+#endif
+
+spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED;	/* NOTE(review): not taken in this file section; presumably guards the task queues */
+
+void tqueue_bh(void)
+{
+	run_task_queue(&tq_timer);	/* presumably the TQUEUE_BH work that do_timer() marks */
+}
+
+void immediate_bh(void)
+{
+	run_task_queue(&tq_immediate);	/* hands the tq_immediate queue to run_task_queue() */
+}
+
+/*
+ * this routine handles the overflow of the microsecond field: it is run
+ * by update_wall_time() once for every second carried out of xtime.tv_usec
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+static void second_overflow(void)
+{
+    long ltemp;
+
+    /* Bump the maxerror field; clamp at NTP_PHASE_LIMIT and flag unsync */
+    time_maxerror += time_tolerance >> SHIFT_USEC;
+    if ( time_maxerror > NTP_PHASE_LIMIT ) {
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_status |= STA_UNSYNC;
+    }
+
+    /*
+     * Leap second processing. If in leap-insert state at
+     * the end of the day, the system clock is set back one
+     * second; if in leap-delete state, the system clock is
+     * set ahead one second. The microtime() routine or
+     * external clock driver will ensure that reported time
+     * is always monotonic. The ugly divides should be
+     * replaced.
+     */
+    switch (time_state) {
+
+    case TIME_OK:
+	if (time_status & STA_INS)
+	    time_state = TIME_INS;
+	else if (time_status & STA_DEL)
+	    time_state = TIME_DEL;
+	break;
+
+    case TIME_INS:
+	if (xtime.tv_sec % 86400 == 0) {
+	    xtime.tv_sec--;
+	    time_state = TIME_OOP;
+	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+	}
+	break;
+
+    case TIME_DEL:
+	if ((xtime.tv_sec + 1) % 86400 == 0) {
+	    xtime.tv_sec++;
+	    time_state = TIME_WAIT;
+	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+	}
+	break;
+
+    case TIME_OOP:
+	time_state = TIME_WAIT;
+	break;
+
+    case TIME_WAIT:	/* stay here until both leap requests are cleared */
+	if (!(time_status & (STA_INS | STA_DEL)))
+	    time_state = TIME_OK;
+    }
+
+    /*
+     * Compute the phase adjustment for the next second. In
+     * PLL mode, the offset is reduced by a fixed factor
+     * times the time constant. In FLL mode the offset is
+     * used directly. In either mode, the maximum phase
+     * adjustment for each second is clamped so as to spread
+     * the adjustment over not more than the number of
+     * seconds between updates.
+     */
+    if (time_offset < 0) {
+	ltemp = -time_offset;
+	if (!(time_status & STA_FLL))
+	    ltemp >>= SHIFT_KG + time_constant;
+	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+	time_offset += ltemp;
+	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+    } else {
+	ltemp = time_offset;
+	if (!(time_status & STA_FLL))
+	    ltemp >>= SHIFT_KG + time_constant;
+	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+	time_offset -= ltemp;
+	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
+    }
+
+    /*
+     * Compute the frequency estimate and additional phase
+     * adjustment due to frequency error for the next
+     * second. When the PPS signal is engaged, gnaw on the
+     * watchdog counter and update the frequency computed by
+     * the pll and the PPS signal.
+     */
+    pps_valid++;
+    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
+	pps_jitter = MAXTIME;
+	pps_stabil = MAXFREQ;
+	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+			 STA_PPSWANDER | STA_PPSERROR);
+    }
+    ltemp = time_freq + pps_freq;
+    if (ltemp < 0)
+	time_adj -= -ltemp >>
+	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+    else
+	time_adj += ltemp >>
+	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if HZ == 100
+    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
+     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
+     */
+    if (time_adj < 0)
+	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
+    else
+	time_adj += (time_adj >> 2) + (time_adj >> 5);
+#endif
+}
+
+/*
+ * Advance xtime by one tick, folding in any pending adjtime slew and the
+ * NTP phase correction.  In the NTP reference this is called "hardclock()".
+ */
+static void update_wall_time_one_tick(void)
+{
+	time_adjust_step = time_adjust;
+	if (time_adjust_step != 0) {
+		/*
+		 * An adjtime adjustment is outstanding.  A positive
+		 * time_adjust means the clock should run faster.  Apply at
+		 * most tickadj of it (in either direction) on this tick.
+		 */
+		if (time_adjust > tickadj)
+			time_adjust_step = tickadj;
+		else if (time_adjust < -tickadj)
+			time_adjust_step = -tickadj;
+
+		/* Whatever is applied now is no longer owed. */
+		time_adjust -= time_adjust_step;
+	}
+	xtime.tv_usec += tick + time_adjust_step;
+
+	/*
+	 * Accumulate the scaled phase; once it gets to one microsecond,
+	 * move the whole part into tv_usec.
+	 */
+	time_phase += time_adj;
+	if (time_phase <= -FINEUSEC) {
+		long usecs = -time_phase >> SHIFT_SCALE;
+		time_phase += usecs << SHIFT_SCALE;
+		xtime.tv_usec -= usecs;
+	} else if (time_phase >= FINEUSEC) {
+		long usecs = time_phase >> SHIFT_SCALE;
+		time_phase -= usecs << SHIFT_SCALE;
+		xtime.tv_usec += usecs;
+	}
+}
+
+/*
+ * Advance the wall clock by 'ticks' timer ticks, then carry whole seconds
+ * out of xtime.tv_usec, running second_overflow() for each one.
+ *
+ * Using a loop looks inefficient, but "ticks" is usually just one (we
+ * shouldn't be losing ticks).  ticks == 0 is a no-op; a decrement-first
+ * loop would wrap the unsigned count and spin for ages.
+ */
+static void update_wall_time(unsigned long ticks)
+{
+	for (; ticks; ticks--)
+		update_wall_time_one_tick();
+
+	/* Normalise tv_usec back below one second. */
+	while (xtime.tv_usec >= 1000000) {
+		xtime.tv_usec -= 1000000;
+		xtime.tv_sec++;
+		second_overflow();
+	}
+}
+
+/* Charge user/system ticks to p, then police RLIMIT_CPU on the new total. */
+static inline void do_process_times(struct task_struct *p,
+	unsigned long user, unsigned long system)
+{
+	unsigned long total = (p->times.tms_utime += user);
+	unsigned long secs;
+
+	total += (p->times.tms_stime += system);
+	secs = total / HZ;
+	if (secs <= p->rlim[RLIMIT_CPU].rlim_cur)
+		return;
+	if (total % HZ == 0)	/* over the soft limit: SIGXCPU every second */
+		send_sig(SIGXCPU, p, 1);
+	if (secs > p->rlim[RLIMIT_CPU].rlim_max)	/* over max: SIGKILL */
+		send_sig(SIGKILL, p, 1);
+}
+
+/* Count user ticks off p's virtual timer; on expiry reload it, send SIGVTALRM. */
+static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
+{
+	unsigned long it_virt = p->it_virt_value;
+	if (it_virt) {
+		if (ticks >= it_virt) {	/* >=: a multi-tick charge may step past 0 */
+			it_virt = p->it_virt_incr;
+			send_sig(SIGVTALRM, p, 1);
+		} else
+			it_virt -= ticks;
+		p->it_virt_value = it_virt;
+	}
+}
+
+/* Step p's profiling timer (it_prof_value); at 0 reload it and send SIGPROF. */
+static inline void do_it_prof(struct task_struct *p)
+{
+	unsigned long left = p->it_prof_value;
+	if (!left)
+		return;
+	if (--left == 0) {
+		left = p->it_prof_incr;
+		send_sig(SIGPROF, p, 1);
+	}
+	p->it_prof_value = left;
+}
+
+void update_one_process(struct task_struct *p, unsigned long user,
+			unsigned long system, int cpu)
+{
+	p->per_cpu_utime[cpu] += user;		/* per-CPU tallies for this task */
+	p->per_cpu_stime[cpu] += system;
+	do_process_times(p, user, system);	/* totals and RLIMIT_CPU signals */
+	do_it_virt(p, user);			/* virtual timer sees user ticks only */
+	do_it_prof(p);	/* NOTE(review): steps once per call, even if user+system > 1 */
+}	
+
+/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process.  user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+	int system = user_tick ^ 1;
+
+	update_one_process(p, user_tick, system, cpu);
+	if (!p->pid) {
+		/* pid 0: count system time only in a bh or with irq count > 1 */
+		if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+			kstat.per_cpu_system[cpu] += system;
+		return;
+	}
+	if (--p->counter <= 0) {
+		p->counter = 0;
+		/* SCHED_FIFO is priority preemption: whether to reschedule
+		 * such a task is not decided here - Bhavesh Davda */
+		if (p->policy != SCHED_FIFO)
+			p->need_resched = 1;
+	}
+	if (p->nice > 0)
+		kstat.per_cpu_nice[cpu] += user_tick;
+	else
+		kstat.per_cpu_user[cpu] += user_tick;
+	kstat.per_cpu_system[cpu] += system;
+}
+
+/*
+ * Charge user_ticks (user) and system_ticks (system) to current in one call.
+ */
+void update_process_times_us(int user_ticks, int system_ticks)
+{
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+
+	update_one_process(p, user_ticks, system_ticks, cpu);
+	if (p->pid) {
+		p->counter -= user_ticks + system_ticks;
+		if (p->counter <= 0) {
+			p->counter = 0;
+			if (p->policy != SCHED_FIFO)	/* as in update_process_times() */
+				p->need_resched = 1;
+		}
+		if (p->nice > 0)
+			kstat.per_cpu_nice[cpu] += user_ticks;
+		else
+			kstat.per_cpu_user[cpu] += user_ticks;
+		kstat.per_cpu_system[cpu] += system_ticks;
+	} else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
+		kstat.per_cpu_system[cpu] += system_ticks;
+}
+
+/*
+ * Nr of active tasks - each task in state TASK_RUNNING or with
+ * TASK_UNINTERRUPTIBLE set adds FIXED_1, so the sum is fixed-point.
+ */
+static unsigned long count_active_tasks(void)
+{
+	unsigned long active = 0;
+	struct task_struct *t;
+
+	read_lock(&tasklist_lock);
+	for_each_task(t)
+		if (t->state == TASK_RUNNING ||
+		    (t->state & TASK_UNINTERRUPTIBLE))
+			active += FIXED_1;
+	read_unlock(&tasklist_lock);
+	return active;
+}
+
+/*
+ * Load-average accumulators, fed by calc_load() with the EXP_1, EXP_5 and
+ * EXP_15 constants.  Named avenrun[] because the GNU make sources (load.c)
+ * seem to imply that is the standard name for this kind of thing; nothing
+ * else (the fractional size etc) seems to be standardized across machines.
+ */
+unsigned long avenrun[3];
+
+/* Every LOAD_FREQ ticks, fold the current active-task count into avenrun[]. */
+static inline void calc_load(unsigned long ticks)
+{
+	static int countdown = LOAD_FREQ;
+
+	for (countdown -= ticks; countdown < 0; countdown += LOAD_FREQ) {
+		/* fixed-point, see count_active_tasks() */
+		unsigned long active = count_active_tasks();
+
+		CALC_LOAD(avenrun[0], EXP_1, active);
+		CALC_LOAD(avenrun[1], EXP_5, active);
+		CALC_LOAD(avenrun[2], EXP_15, active);
+	}
+}
+
+/* jiffies at the most recent update of wall time; advanced by update_times() */
+unsigned long wall_jiffies;
+
+/*
+ * This rwlock protects us from races on SMP while playing with xtime. -arca
+ */
+rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
+
+/*
+ * Bring wall time up to date with jiffies, then feed the elapsed ticks to
+ * calc_load().  Runs from the raw timer_bh handler with irqs locally
+ * enabled, so no flags need to be saved/restored around the lock. -arca
+ */
+static inline void update_times(void)
+{
+	unsigned long elapsed;
+
+	write_lock_irq(&xtime_lock);
+	vxtime_lock();
+
+	elapsed = jiffies - wall_jiffies;
+	if (elapsed != 0) {
+		wall_jiffies += elapsed;
+		update_wall_time(elapsed);
+	}
+	vxtime_unlock();
+	write_unlock_irq(&xtime_lock);
+	calc_load(elapsed);
+}
+
+void timer_bh(void)
+{
+	update_times();		/* first bring wall time and load average up to date */
+	run_timer_list();	/* then run the timers that have become due */
+}
+
+void do_timer(struct pt_regs *regs)
+{
+	(*(unsigned long *)&jiffies)++;	/* one more tick has elapsed */
+#ifndef CONFIG_SMP
+	/* SMP process accounting uses the local APIC timer */
+	/* UP: charge the tick here, as user or system time according to regs */
+	update_process_times(user_mode(regs));
+#endif
+	mark_bh(TIMER_BH);	/* presumably gets timer_bh() run later */
+	if (TQ_ACTIVE(tq_timer))
+		mark_bh(TQUEUE_BH);
+}
+
+void do_timer_ticks(int ticks)
+{
+	(*(unsigned long *)&jiffies) += ticks;	/* account several ticks at once */
+	mark_bh(TIMER_BH);	/* unlike do_timer(), no process times are charged here */
+	if (TQ_ACTIVE(tq_timer))
+		mark_bh(TQUEUE_BH);
+}
+
+#if !defined(__alpha__) && !defined(__ia64__)
+
+/*
+ * Arm ITIMER_REAL for 'seconds' (one-shot), return the time left on the old
+ * alarm.  For backwards compatibility?  Alpha and newer ports can use libc.
+ */
+asmlinkage unsigned long sys_alarm(unsigned int seconds)
+{
+	struct itimerval next, prev;
+	unsigned int remaining;
+
+	next.it_interval.tv_sec = 0;
+	next.it_interval.tv_usec = 0;
+	next.it_value.tv_sec = seconds;
+	next.it_value.tv_usec = 0;
+	do_setitimer(ITIMER_REAL, &next, &prev);
+	/* Round up: we can't return 0 while an alarm is still pending. */
+	remaining = prev.it_value.tv_sec;
+	if (prev.it_value.tv_usec)
+		remaining++;
+	return remaining;
+}
+
+#endif
+
+#ifndef __alpha__
+
+/*
+ * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
+ * should be moved into arch/i386 instead?
+ */
+
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid.  The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ * The per-thread id (current->pid) is what sys_gettid() returns.
+ * This is SMP safe as current->tgid does not change.
+ */
+asmlinkage long sys_getpid(void)
+{
+	return current->tgid;
+}
+
+/*
+ * This is not strictly SMP safe: p_opptr could change
+ * from under us. However, rather than getting any lock
+ * we can use an optimistic algorithm: get the parent
+ * pid, and go back and check that the parent is still
+ * the same. If it has changed (which is extremely unlikely
+ * indeed), we just try again..
+ *
+ * NOTE! This depends on the fact that even if we _do_
+ * get an old value of "parent", we can happily dereference
+ * the pointer: we just can't necessarily trust the result
+ * until we know that the parent pointer is valid.
+ *
+ * The "mb()" macro is a memory barrier - a synchronizing
+ * event. It also makes sure that gcc doesn't optimize
+ * away the necessary memory references.. The barrier doesn't
+ * have to have all that strong semantics: on x86 we don't
+ * really require a synchronizing instruction, for example.
+ * The barrier is more important for code generation than
+ * for any real memory ordering semantics (even if there is
+ * a small window for a race, using the old pointer is
+ * harmless for a while).
+ */
+asmlinkage long sys_getppid(void)
+{
+	int pid;
+	struct task_struct * me = current;
+	struct task_struct * parent;
+
+	parent = me->p_opptr;
+	for (;;) {
+		pid = parent->pid;
+#ifdef CONFIG_SMP
+{
+		struct task_struct *old = parent;
+		mb();	/* make the re-read of me->p_opptr below a real load */
+		parent = me->p_opptr;
+		if (old != parent)
+			continue;	/* parent changed under us: try again */
+}
+#endif
+		break;
+	}
+	return pid;
+}
+
+asmlinkage long sys_getuid(void)
+{
+	/* Returns current's uid; only we change this so SMP safe */
+	return current->uid;
+}
+
+asmlinkage long sys_geteuid(void)
+{
+	/* Returns current's effective uid; only we change this so SMP safe */
+	return current->euid;
+}
+
+asmlinkage long sys_getgid(void)
+{
+	/* Returns current's gid; only we change this so SMP safe */
+	return current->gid;
+}
+
+asmlinkage long sys_getegid(void)
+{
+	/* Returns current's effective gid; only we change this so SMP safe */
+	return  current->egid;
+}
+
+#endif
+
+/* Thread ID - the internal kernel "pid" (sys_getpid() returns the tgid) */
+asmlinkage long sys_gettid(void)
+{
+	return current->pid;
+}
+
+/*
+ * Sleep for the interval at *rqtp.  Returns 0 once it has fully elapsed.
+ * If the sleep ends early, -EINTR is returned after the unslept remainder
+ * has been stored at *rmtp (when rmtp is non-NULL).
+ */
+asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+	struct timespec req;
+	unsigned long left;
+
+	if (copy_from_user(&req, rqtp, sizeof(req)))
+		return -EFAULT;
+	if (req.tv_sec < 0 || req.tv_nsec < 0 || req.tv_nsec >= 1000000000L)
+		return -EINVAL;
+
+	/*
+	 * Short delay requests up to 2 ms are handled with high precision
+	 * by a busy wait for all real-time processes.  It's important on
+	 * SMP not to do this holding locks.
+	 */
+	if (current->policy != SCHED_OTHER &&
+	    req.tv_sec == 0 && req.tv_nsec <= 2000000L) {
+		udelay((req.tv_nsec + 999) / 1000);
+		return 0;
+	}
+
+	/* One extra jiffy is added for any non-zero request. */
+	left = timespec_to_jiffies(&req) + (req.tv_sec || req.tv_nsec);
+	current->state = TASK_INTERRUPTIBLE;
+	left = schedule_timeout(left);
+	if (!left)
+		return 0;
+
+	if (rmtp) {
+		jiffies_to_timespec(left, &req);
+		if (copy_to_user(rmtp, &req, sizeof(req)))
+			return -EFAULT;
+	}
+	return -EINTR;
+}
+
-- 
2.30.2